diff --git a/.codecov.yml b/.codecov.yml
index 4e80d1cac..e67db52b9 100644
--- a/.codecov.yml
+++ b/.codecov.yml
@@ -12,6 +12,16 @@ codecov:
     after_n_builds: 1
     wait_for_ci: yes
 
+# Make coverage checks informational (report but never fail CI)
+coverage:
+  status:
+    project:
+      default:
+        informational: true
+    patch:
+      default:
+        informational: true
+
 # Change how pull request comments look
 comment:
   layout: "reach,diff,flags,files,footer"
diff --git a/.drone.star b/.drone.star
index fed009c4c..48278a358 100644
--- a/.drone.star
+++ b/.drone.star
@@ -21,7 +21,7 @@ def main(ctx):
     # and cmake-superproject (linux/latest gcc) by default
     jobs = generate(
         [
-            'gcc >=12.0',
+            'gcc >=13.0',
             'clang >=17.0',
             'msvc >=14.1',
             'arm64-gcc latest',
@@ -46,7 +46,26 @@ def main(ctx):
             environment={
                 'B2_TOOLSET': 'clang',
                 'B2_CXXSTD': '20',
-                'B2_CXXFLAGS': '-fexperimental-library',
+            },
+            globalenv=globalenv),
+
+        osx_cxx("macOS: Clang 26.2.0", "clang++", packages="",
+            buildscript="drone", buildtype="boost",
+            xcode_version="26.2.0",
+            environment={
+                'B2_TOOLSET': 'clang',
+                'B2_CXXSTD': '20',
+            },
+            globalenv=globalenv),
+    ]
+
+    jobs += [
+        freebsd_cxx("clang-22", "clang++-22",
+            buildscript="drone", buildtype="boost",
+            freebsd_version="15.0",
+            environment={
+                'B2_TOOLSET': 'clang-22',
+                'B2_CXXSTD': '20',
             },
             globalenv=globalenv),
     ]
diff --git a/.github/compilers.json b/.github/compilers.json
new file mode 100644
index 000000000..f0484482a
--- /dev/null
+++ b/.github/compilers.json
@@ -0,0 +1,153 @@
+{
+  "gcc": [
+    {
+      "version": "13",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "ubuntu-24.04",
+      "cxx": "g++-13",
+      "cc": "gcc-13",
+      "b2_toolset": "gcc",
+      "arm": true,
+      "is_earliest": true
+    },
+    {
+      "version": "14",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "ubuntu-24.04",
+      "cxx": "g++-14",
+      "cc": "gcc-14",
+      "b2_toolset": "gcc",
+      "arm": true
+    },
+    {
+      "version": "15",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "ubuntu-24.04",
+      "container": "ubuntu:25.04",
+      "cxx": "g++-15",
+      "cc": "gcc-15",
+      "b2_toolset": "gcc",
+      "is_latest": true
+    }
+  ],
+  "clang": [
+    {
+      "version": "17",
+      "cxxstd": "20",
+      "latest_cxxstd": "20",
+      "runs_on": "ubuntu-24.04",
+      "cxx": "clang++-17",
+      "cc": "clang-17",
+      "b2_toolset": "clang",
+      "arm": true,
+      "is_earliest": true
+    },
+    {
+      "version": "18",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "ubuntu-24.04",
+      "cxx": "clang++-18",
+      "cc": "clang-18",
+      "b2_toolset": "clang",
+      "arm": true
+    },
+    {
+      "version": "19",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "ubuntu-24.04",
+      "cxx": "clang++-19",
+      "cc": "clang-19",
+      "b2_toolset": "clang",
+      "arm": true
+    },
+    {
+      "version": "20",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "ubuntu-24.04",
+      "container": "ubuntu:24.04",
+      "cxx": "clang++-20",
+      "cc": "clang-20",
+      "b2_toolset": "clang",
+      "arm": true,
+      "is_latest": true,
+      "clang_tidy": true
+    }
+  ],
+  "msvc": [
+    {
+      "version": "14.34",
+      "cxxstd": "20",
+      "latest_cxxstd": "20",
+      "runs_on": "windows-2022",
+      "b2_toolset": "msvc-14.3",
+      "generator": "Visual Studio 17 2022",
+      "is_earliest": true
+    },
+    {
+      "version": "14.44",
+      "cxxstd": "20",
+      "latest_cxxstd": "20",
+      "runs_on": "windows-2022",
+      "b2_toolset": "msvc-14.4",
+      "generator": "Visual Studio 17 2022",
+      "is_latest": true
+    }
+  ],
+  "mingw": [
+    {
+      "version": "*",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "windows-2022",
+      "cxx": "clang++",
+      "cc": "clang",
+      "b2_toolset": "clang",
+      "build_cmake": false,
+      "is_latest": true
+    }
+  ],
+  "clang-cl": [
+    {
+      "version": "*",
+      "cxxstd": "20",
+      "latest_cxxstd": "20",
+      "runs_on": "windows-2022",
+      "cxx": "clang++-cl",
+      "cc": "clang-cl",
+      "b2_toolset": "clang-win",
+      "generator_toolset": "ClangCL",
+      "build_cmake": false,
+      "is_latest": true
+    }
+  ],
+  "apple-clang": [
+    {
+      "version": "*",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "macos-15",
+      "cxx": "clang++",
+      "cc": "clang",
+      "b2_toolset": "clang",
+      "cxxflags": "-fvisibility=hidden -fvisibility-inlines-hidden",
+      "is_earliest": true
+    },
+    {
+      "version": "*",
+      "cxxstd": "20,23",
+      "latest_cxxstd": "23",
+      "runs_on": "macos-26",
+      "cxx": "clang++",
+      "cc": "clang",
+      "b2_toolset": "clang",
+      "cxxflags": "-fvisibility=hidden -fvisibility-inlines-hidden",
+      "is_latest": true
+    }
+  ]
+}
diff --git a/.github/generate-matrix.py b/.github/generate-matrix.py
new file mode 100644
index 000000000..3bb01c99b
--- /dev/null
+++ b/.github/generate-matrix.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python3
+#
+# Copyright (c) 2026 Michael Vandeberg
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+"""
+Generate CI matrix JSON for GitHub Actions.
+
+Reads compilers.json and outputs a JSON array of matrix entries to stdout.
+Each entry has fields matching what the ci.yml build job expects.
+
+Usage:
+    python3 generate-matrix.py                    # JSON array
+    python3 generate-matrix.py | python3 -m json.tool  # pretty-printed
+"""
+
+import json
+import os
+import sys
+
+
+def load_compilers(path=None):
+    if path is None:
+        path = os.path.join(os.path.dirname(__file__), "compilers.json")
+    with open(path) as f:
+        return json.load(f)
+
+
+def platform_for_family(compiler_family):
+    """Return the platform name for a compiler family."""
+    if compiler_family in ("msvc", "clang-cl", "mingw"):
+        return "windows"
+    elif compiler_family == "apple-clang":
+        return "macos"
+    return "linux"
+
+
+def make_entry(compiler_family, spec, **overrides):
+    """Build a matrix entry dict from a compiler spec and optional overrides."""
+    entry = {
+        "compiler": compiler_family,
+        "version": spec["version"],
+        "cxxstd": spec["cxxstd"],
+        "latest-cxxstd": spec["latest_cxxstd"],
+        "runs-on": spec["runs_on"],
+        "b2-toolset": spec["b2_toolset"],
+        "shared": True,
+        "build-type": "Release",
+        platform_for_family(compiler_family): True,
+    }
+
+    if spec.get("container"):
+        entry["container"] = spec["container"]
+    if spec.get("cxx"):
+        entry["cxx"] = spec["cxx"]
+    if spec.get("cc"):
+        entry["cc"] = spec["cc"]
+    if spec.get("generator"):
+        entry["generator"] = spec["generator"]
+    if spec.get("generator_toolset"):
+        entry["generator-toolset"] = spec["generator_toolset"]
+    if spec.get("is_latest"):
+        entry["is-latest"] = True
+    if spec.get("is_earliest"):
+        entry["is-earliest"] = True
+    if "shared" in spec:
+        entry["shared"] = spec["shared"]
+    if spec.get("vcpkg_triplet"):
+        entry["vcpkg-triplet"] = spec["vcpkg_triplet"]
+
+    # CMake builds only on earliest/latest compilers, unless explicitly disabled
+    if spec.get("build_cmake") is False:
+        entry["build-cmake"] = False
+    elif spec.get("is_latest") or spec.get("is_earliest"):
+        entry["build-cmake"] = True
+    if spec.get("cmake_cxxstd"):
+        entry["cmake-cxxstd"] = spec["cmake_cxxstd"]
+    if spec.get("cxxflags"):
+        entry["cxxflags"] = spec["cxxflags"]
+
+    entry.update(overrides)
+    entry["name"] = generate_name(compiler_family, entry)
+    return entry
+
+
+def generate_name(compiler_family, entry):
+    """Generate a human-readable job name from entry fields."""
+    name_map = {
+        "gcc": "GCC",
+        "clang": "Clang",
+        "msvc": "MSVC",
+        "mingw": "MinGW Clang",
+        "clang-cl": "Clang-CL",
+        "apple-clang": "Apple-Clang",
+    }
+    prefix = name_map.get(compiler_family, compiler_family)
+
+    version = entry["version"]
+    if version != "*":
+        prefix = f"{prefix} {version}"
+
+    standards = entry["cxxstd"].split(",")
+    cxxstd = ",".join(f"C++{s}" for s in standards)
+
+    modifiers = []
+
+    runner = entry["runs-on"]
+    if "arm" in runner:
+        modifiers.append("arm64")
+    elif compiler_family == "apple-clang":
+        # Extract macOS version from runner name
+        macos_ver = runner.replace("macos-", "macOS ")
+        modifiers.append(macos_ver)
+
+    if entry.get("tsan"):
+        modifiers.append("tsan")
+    elif entry.get("asan") and entry.get("ubsan"):
+        modifiers.append("asan+ubsan")
+    elif entry.get("asan"):
+        modifiers.append("asan")
+    elif entry.get("ubsan"):
+        modifiers.append("ubsan")
+
+    if entry.get("coverage"):
+        modifiers.append("coverage")
+
+    if entry.get("x86"):
+        modifiers.append("x86")
+
+    if entry.get("clang-tidy"):
+        modifiers.append("clang-tidy")
+
+    if entry.get("time-trace"):
+        modifiers.append("time-trace")
+
+    if entry.get("superproject-cmake"):
+        modifiers.append("superproject CMake")
+
+    if entry.get("shared") is False:
+        modifiers.append("static")
+
+    suffix = f" ({', '.join(modifiers)})" if modifiers else ""
+    return f"{prefix}: {cxxstd}{suffix}"
+
+
+def generate_sanitizer_variant(compiler_family, spec):
+    """Generate ASAN+UBSAN variant for the latest compiler in a family.
+
+    MSVC does not support UBSAN; only ASAN is enabled for MSVC.
+    """
+    overrides = {
+        "asan": True,
+        "build-type": "RelWithDebInfo",
+        "shared": True,
+        "build-cmake": False,
+    }
+
+    # MSVC and Clang-CL only support ASAN, not UBSAN
+    if compiler_family not in ("msvc", "clang-cl"):
+        overrides["ubsan"] = True
+
+    if compiler_family == "clang":
+        overrides["shared"] = False
+
+    return make_entry(compiler_family, spec, **overrides)
+
+
+def generate_tsan_variant(compiler_family, spec):
+    """Generate TSan variant for the latest compiler in a family."""
+    overrides = {
+        "tsan": True,
+        "build-type": "RelWithDebInfo",
+        "shared": True,
+        "build-cmake": False,
+    }
+
+    if compiler_family in ("clang", "apple-clang"):
+        overrides["shared"] = False
+
+    return make_entry(compiler_family, spec, **overrides)
+
+
+def generate_coverage_variant(compiler_family, spec):
+    """Generate coverage variant with platform-specific flags.
+
+    Linux/Windows: full gcov flags with atomic profile updates.
+    macOS: --coverage only (Apple-Clang uses llvm-cov).
+    """
+    platform = platform_for_family(compiler_family)
+
+    if platform == "macos":
+        cov_flags = "--coverage"
+    else:
+        cov_flags = ("--coverage -fprofile-arcs -ftest-coverage"
+                     " -fprofile-update=atomic")
+
+    overrides = {
+        "coverage": True,
+        "coverage-flag": platform,
+        "shared": False,
+        "build-type": "Debug",
+        "build-cmake": False,
+        "cxxflags": cov_flags,
+        "ccflags": cov_flags,
+    }
+
+    if platform == "linux":
+        overrides["install"] = "lcov wget unzip"
+
+    entry = make_entry(compiler_family, spec, **overrides)
+    entry.pop("is-latest", None)
+    entry.pop("is-earliest", None)
+    return entry
+
+
+def generate_x86_variant(compiler_family, spec):
+    """Generate x86 (32-bit) variant (Clang only)."""
+    return make_entry(compiler_family, spec,
+        x86=True,
+        shared=False,
+        install="gcc-multilib g++-multilib")
+
+
+def generate_arm_entry(compiler_family, spec):
+    """Generate ARM64 variant for a compiler spec."""
+    arm_runner = spec["runs_on"].replace("ubuntu-24.04", "ubuntu-24.04-arm")
+    # ARM runners don't support containers — build a spec copy without container
+    arm_spec = {k: v for k, v in spec.items() if k != "container"}
+    arm_spec["runs_on"] = arm_runner
+    return make_entry(compiler_family, arm_spec)
+
+
+def generate_time_trace_variant(compiler_family, spec):
+    """Generate time-trace variant for compile-time profiling (Clang only)."""
+    return make_entry(compiler_family, spec, **{
+        "time-trace": True,
+        "build-cmake": True,
+        "cxxflags": "-ftime-trace",
+    })
+
+
+def generate_superproject_cmake_variant(compiler_family, spec):
+    """Generate a single superproject CMake build to verify integration."""
+    entry = make_entry(compiler_family, spec, **{
+        "superproject-cmake": True,
+        "build-cmake": False,
+    })
+    entry.pop("is-latest", None)
+    entry.pop("is-earliest", None)
+    return entry
+
+
+def apply_clang_tidy(entry, spec):
+    """Add clang-tidy flag and install package to an entry."""
+    entry["clang-tidy"] = True
+    entry["build-cmake"] = False
+    version = spec["version"]
+    existing_install = entry.get("install", "")
+    tidy_pkg = f"clang-tidy-{version}"
+    entry["install"] = f"{existing_install} {tidy_pkg}".strip()
+    entry["name"] = generate_name(entry["compiler"], entry)
+    return entry
+
+
+def main():
+    compilers = load_compilers()
+    matrix = []
+
+    for family, specs in compilers.items():
+        for spec in specs:
+            # Base entry (x86_64 / default arch)
+            base = make_entry(family, spec)
+            if spec.get("clang_tidy"):
+                apply_clang_tidy(base, spec)
+            matrix.append(base)
+
+            # ARM entry if supported
+            if spec.get("arm"):
+                matrix.append(generate_arm_entry(family, spec))
+
+            # Variants for the latest compiler in each family
+            if spec.get("is_latest"):
+                if family != "mingw":
+                    matrix.append(generate_sanitizer_variant(family, spec))
+
+                # TSan is incompatible with ASan; separate variant for Linux
+                if family in ("gcc", "clang", "apple-clang"):
+                    matrix.append(generate_tsan_variant(family, spec))
+
+                # GCC always gets coverage; other families opt in via spec flag
+                if family == "gcc" or spec.get("coverage"):
+                    matrix.append(generate_coverage_variant(family, spec))
+
+                if family == "gcc":
+                    matrix.append(generate_superproject_cmake_variant(family, spec))
+
+                if family == "clang":
+                    matrix.append(generate_x86_variant(family, spec))
+                    matrix.append(generate_time_trace_variant(family, spec))
+
+    json.dump(matrix, sys.stdout)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 769d04cae..1851cba1a 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -1,4 +1,5 @@
 #
+# Copyright (c) 2026 Michael Vandeberg
 # Copyright (c) 2023 Christian Mazakas
 # Copyright (c) 2023 Alan de Freitas
 # Copyright (c) 2021-2023 Sam Darwin
@@ -38,180 +39,27 @@ env:
   TZ: "Europe/London"
 
 jobs:
+  generate-matrix:
+    name: Generate Matrix
+    runs-on: ubuntu-24.04
+    outputs:
+      matrix: ${{ steps.generate.outputs.matrix }}
+    steps:
+      - uses: actions/checkout@v4
+      - id: generate
+        run: |
+          matrix=$(python3 .github/generate-matrix.py)
+          echo "matrix={\"include\":$matrix}" >> "$GITHUB_OUTPUT"
+
   build:
+    needs: [generate-matrix]
     defaults:
       run:
         shell: bash
 
     strategy:
       fail-fast: false
-      matrix:
-        include:
-          # Windows (3 configurations)
-
-          - compiler: "msvc"
-            version: "14.42"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            runs-on: "windows-2022"
-            b2-toolset: "msvc-14.4"
-            generator: "Visual Studio 17 2022"
-            is-latest: true
-            name: "MSVC 14.42: C++20"
-            shared: false
-            build-type: "Release"
-            build-cmake: true
-
-          - compiler: "msvc"
-            version: "14.34"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            runs-on: "windows-2022"
-            b2-toolset: "msvc-14.3"
-            generator: "Visual Studio 17 2022"
-            name: "MSVC 14.34: C++20 (shared)"
-            shared: true
-            build-type: "Release"
-
-          # macOS (2 configurations)
-
-          - compiler: "apple-clang"
-            version: "*"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            cxx: "clang++"
-            cc: "clang"
-            runs-on: "macos-26"
-            b2-toolset: "clang"
-            is-latest: true
-            name: "Apple-Clang (macOS 26, asan+ubsan): C++20"
-            shared: true
-            build-type: "RelWithDebInfo"
-            asan: true
-            ubsan: true
-
-          # Linux GCC (4 configurations)
-
-          - compiler: "gcc"
-            version: "15"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            cxx: "g++-15"
-            cc: "gcc-15"
-            runs-on: "ubuntu-latest"
-            container: "ubuntu:25.04"
-            b2-toolset: "gcc"
-            is-latest: true
-            name: "GCC 15: C++20"
-            shared: true
-            build-type: "Release"
-            build-cmake: true
-
-          - compiler: "gcc"
-            version: "15"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            cxx: "g++-15"
-            cc: "gcc-15"
-            runs-on: "ubuntu-latest"
-            container: "ubuntu:25.04"
-            b2-toolset: "gcc"
-            is-latest: true
-            name: "GCC 15: C++20 (asan+ubsan)"
-            shared: true
-            asan: true
-            ubsan: true
-            build-type: "RelWithDebInfo"
-
-          - compiler: "gcc"
-            version: "12"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            cxx: "g++-12"
-            cc: "gcc-12"
-            runs-on: "ubuntu-latest"
-            container: "ubuntu:22.04"
-            b2-toolset: "gcc"
-            name: "GCC 12: C++20"
-            shared: true
-            build-type: "Release"
-
-          - compiler: "gcc"
-            version: "13"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            cxx: "g++-13"
-            cc: "gcc-13"
-            runs-on: "ubuntu-24.04"
-            b2-toolset: "gcc"
-            name: "GCC 13: C++20 (coverage)"
-            shared: false
-            coverage: true
-            build-type: "Debug"
-            cxxflags: "--coverage -fprofile-arcs -ftest-coverage"
-            ccflags: "--coverage -fprofile-arcs -ftest-coverage"
-            install: "lcov wget unzip"
-
-          # Linux Clang (5 configurations)
-
-          - compiler: "clang"
-            version: "20"
-            cxxstd: "20,23"
-            latest-cxxstd: "23"
-            cxx: "clang++-20"
-            cc: "clang-20"
-            runs-on: "ubuntu-latest"
-            container: "ubuntu:24.04"
-            b2-toolset: "clang"
-            is-latest: true
-            name: "Clang 20: C++20-23"
-            shared: true
-            build-type: "Release"
-            build-cmake: true
-
-          - compiler: "clang"
-            version: "20"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            cxx: "clang++-20"
-            cc: "clang-20"
-            runs-on: "ubuntu-latest"
-            container: "ubuntu:24.04"
-            b2-toolset: "clang"
-            is-latest: true
-            name: "Clang 20: C++20 (asan+ubsan)"
-            shared: false
-            asan: true
-            ubsan: true
-            build-type: "RelWithDebInfo"
-
-          - compiler: "clang"
-            version: "17"
-            cxxstd: "20"
-            latest-cxxstd: "20"
-            cxx: "clang++-17"
-            cc: "clang-17"
-            runs-on: "ubuntu-24.04"
-            b2-toolset: "clang"
-            name: "Clang 17: C++20"
-            shared: false
-            build-type: "Release"
-
-          - compiler: "clang"
-            version: "20"
-            cxxstd: "20,23"
-            latest-cxxstd: "23"
-            cxx: "clang++-20"
-            cc: "clang-20"
-            runs-on: "ubuntu-latest"
-            container: "ubuntu:24.04"
-            b2-toolset: "clang"
-            is-latest: true
-            name: "Clang 20: C++20-23 (x86)"
-            shared: false
-            x86: true
-            build-type: "Release"
-            install: "gcc-multilib g++-multilib"
+      matrix: ${{ fromJSON(needs.generate-matrix.outputs.matrix) }}
 
     name: ${{ matrix.name }}
     runs-on: ${{ matrix.runs-on }}
@@ -244,6 +92,14 @@ jobs:
           apt-get: >-
             ${{ matrix.install }}
             build-essential
+            curl
+
+      - name: Setup MSYS2 (MinGW Clang)
+        if: matrix.compiler == 'mingw'
+        shell: bash
+        run: |
+          /c/msys64/usr/bin/pacman.exe -S --noconfirm mingw-w64-clang-x86_64-clang
+          echo "C:/msys64/clang64/bin" >> "$GITHUB_PATH"
 
       - name: Clone Boost
         uses: alandefreitas/cpp-actions/boost-clone@v1.9.0
@@ -265,9 +121,6 @@ jobs:
         shell: bash
         run: |
           set -xe
-          pwd
-          ls
-          ls -lah boost-source
 
           # Identify boost module being tested
           module=${GITHUB_REPOSITORY#*/}
@@ -280,6 +133,18 @@ jobs:
           # Remove module from boost-source
           rm -r "boost-source/libs/$module" || true
 
+          # Disable sparse checkout for superproject CMake builds
+          # (needed so CMakeLists.txt files in sibling boost libraries are available)
+          if [ "${{ matrix.superproject-cmake }}" = "true" ]; then
+            cd boost-source
+            if git sparse-checkout list > /dev/null 2>&1; then
+                git sparse-checkout disable
+                git fetch origin --no-tags
+                git checkout
+            fi
+            cd ..
+          fi
+
           # Copy cached boost-source to an isolated boost-root
           cp -rL boost-source boost-root
 
@@ -294,84 +159,295 @@ jobs:
 
       - name: Boost B2 Workflow
         uses: alandefreitas/cpp-actions/b2-workflow@v1.9.0
-        if: ${{ !matrix.coverage && !matrix.time-trace }}
+        if: ${{ !matrix.coverage && !matrix.time-trace && !matrix.superproject-cmake && !matrix.clang-tidy }}
         env:
-          ASAN_OPTIONS: ${{ ((matrix.compiler == 'apple-clang' || matrix.compiler == 'clang') && 'detect_invalid_pointer_pairs=0:strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1') || 'detect_invalid_pointer_pairs=2:strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1' }}
+          ASAN_OPTIONS: ${{ ((matrix.compiler == 'apple-clang' || matrix.compiler == 'clang' || matrix.compiler == 'mingw') && 'detect_invalid_pointer_pairs=0:strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1') || 'detect_invalid_pointer_pairs=2:strict_string_checks=1:detect_stack_use_after_return=1:check_initialization_order=1:strict_init_order=1' }}
+          TSAN_OPTIONS: ${{ matrix.tsan && 'halt_on_error=1:second_deadlock_stack=1' || '' }}
         with:
           source-dir: boost-root
           modules: capy
           toolset: ${{ matrix.b2-toolset }}
           build-variant: ${{ (matrix.compiler == 'msvc' && 'debug,release') || matrix.build-type }}
-          cxx: ${{ steps.setup-cpp.outputs.cxx || matrix.cxx || '' }}
+          cxx: ${{ matrix.cxx || steps.setup-cpp.outputs.cxx || '' }}
           cxxstd: ${{ matrix.cxxstd }}
           address-model: ${{ (matrix.x86 && '32') || '64' }}
           asan: ${{ matrix.asan }}
           ubsan: ${{ matrix.ubsan }}
+          tsan: ${{ matrix.tsan }}
           shared: ${{ matrix.shared }}
           rtti: on
-          cxxflags: ${{ (matrix.asan && '-fsanitize-address-use-after-scope -fsanitize=pointer-subtract') || '' }}
+          cxxflags: ${{ (matrix.asan && matrix.compiler != 'msvc' && matrix.compiler != 'clang-cl' && '-fsanitize-address-use-after-scope -fsanitize=pointer-subtract') || '' }}
           stop-on-error: true
           extra-args: ${{ (matrix.valgrind && 'testing.launcher=valgrind' || '' )}}
 
-      - name: CMake Workflow
+      - name: Boost CMake Workflow
         uses: alandefreitas/cpp-actions/cmake-workflow@v1.9.0
-        if: ${{ matrix.coverage || matrix.time-trace || matrix.build-cmake || matrix.is-earliest }}
+        if: ${{ matrix.superproject-cmake }}
         with:
-          source-dir: capy-root
-          build-dir: __build__
+          source-dir: boost-root
+          build-dir: __build_cmake_test__
           generator: ${{ matrix.generator }}
           generator-toolset: ${{ matrix.generator-toolset }}
           build-type: ${{ matrix.build-type }}
           build-target: boost_capy_tests
           run-tests: true
-          cxxstd: ${{ matrix.latest-cxxstd }}
+          cxxstd: ${{ matrix.cmake-cxxstd || matrix.cxxstd }}
           cc:  ${{ steps.setup-cpp.outputs.cc || matrix.cc }}
           ccflags: ${{ matrix.ccflags }}
           cxx: ${{ steps.setup-cpp.outputs.cxx || matrix.cxx }}
           cxxflags: ${{ matrix.cxxflags }}
           shared: ${{ matrix.shared }}
           cmake-version: '>=3.20'
-          install: false
+          install: true
+          install-prefix: ${{ steps.patch.outputs.workspace_root }}/.local
           package: false
           package-artifact: false
+          ref-source-dir: boost-root/libs/capy
+          extra-args: -D BOOST_INCLUDE_LIBRARIES=capy
+
+      - name: Set Path (Windows Shared)
+        if: ${{ matrix.windows && matrix.shared }}
+        run: echo "$GITHUB_WORKSPACE/.local/bin" >> $GITHUB_PATH
+
+      - name: Set LD_LIBRARY_PATH (Linux Shared)
+        if: ${{ matrix.linux && matrix.shared }}
+        run: |
+          echo "LD_LIBRARY_PATH=$GITHUB_WORKSPACE/.local/lib:$LD_LIBRARY_PATH" >> "$GITHUB_ENV"
+
+      - name: Find Package Integration Test
+        uses: alandefreitas/cpp-actions/cmake-workflow@v1.9.0
+        if: ${{ matrix.superproject-cmake }}
+        with:
+          source-dir: boost-root/libs/${{ steps.patch.outputs.module }}/test/cmake_test
+          build-dir: __build_cmake_install_test__
+          generator: ${{ matrix.generator }}
+          generator-toolset: ${{ matrix.generator-toolset }}
+          build-type: ${{ matrix.build-type }}
+          cxxstd: ${{ matrix.latest-cxxstd }}
+          cc: ${{ steps.setup-cpp.outputs.cc || matrix.cc }}
+          cxx: ${{ steps.setup-cpp.outputs.cxx || matrix.cxx }}
+          install: false
+          cmake-version: '>=3.20'
+          extra-args: |
+            -D BOOST_CI_INSTALL_TEST=ON
+            -D CMAKE_PREFIX_PATH=${{ steps.patch.outputs.workspace_root }}/.local
+          ref-source-dir: boost-root/libs/capy
+
+      - name: Subdirectory Integration Test
+        uses: alandefreitas/cpp-actions/cmake-workflow@v1.9.0
+        if: ${{ matrix.superproject-cmake }}
+        with:
+          source-dir: boost-root/libs/${{ steps.patch.outputs.module }}/test/cmake_test
+          build-dir: __build_cmake_subdir_test__
+          generator: ${{ matrix.generator }}
+          generator-toolset: ${{ matrix.generator-toolset }}
+          build-type: ${{ matrix.build-type }}
+          cxxstd: ${{ matrix.latest-cxxstd }}
+          cc: ${{ steps.setup-cpp.outputs.cc || matrix.cc }}
+          cxx: ${{ steps.setup-cpp.outputs.cxx || matrix.cxx }}
+          install: false
+          cmake-version: '>=3.20'
+          extra-args: -D BOOST_CI_INSTALL_TEST=OFF
+          ref-source-dir: boost-root/libs/capy/test/cmake_test
+
+      - name: Root Project CMake Workflow
+        uses: alandefreitas/cpp-actions/cmake-workflow@v1.9.0
+        if: ${{ matrix.build-cmake || matrix.coverage }}
+        with:
+          source-dir: capy-root
+          build-dir: __build__
+          generator: ${{ matrix.generator }}
+          generator-toolset: ${{ matrix.generator-toolset }}
+          build-type: ${{ matrix.build-type }}
+          run-tests: true
+          cxxstd: ${{ matrix.cmake-cxxstd || matrix.cxxstd }}
+          cc:  ${{ steps.setup-cpp.outputs.cc || matrix.cc }}
+          ccflags: ${{ matrix.ccflags }}
+          cxx: ${{ steps.setup-cpp.outputs.cxx || matrix.cxx }}
+          cxxflags: ${{ matrix.cxxflags }}
+          shared: ${{ matrix.shared }}
+          cmake-version: '>=3.20'
+          install: false
+          ref-source-dir: capy-root
+
+      - name: Configure for clang-tidy
+        uses: alandefreitas/cpp-actions/cmake-workflow@v1.9.0
+        if: ${{ matrix.clang-tidy }}
+        with:
+          source-dir: capy-root
+          build-dir: __build__
+          generator: ${{ matrix.generator }}
+          cxxstd: ${{ matrix.latest-cxxstd }}
+          cc: ${{ steps.setup-cpp.outputs.cc || matrix.cc }}
+          cxx: ${{ steps.setup-cpp.outputs.cxx || matrix.cxx }}
+          cmake-version: '>=3.20'
+          extra-args: -D CMAKE_EXPORT_COMPILE_COMMANDS=ON
+          build: false
+          run-tests: false
+          install: false
           ref-source-dir: capy-root
 
+      - name: Run clang-tidy
+        if: ${{ matrix.clang-tidy }}
+        run: |
+          python3 -c "import json; [print(e['file']) for e in json.load(open('capy-root/__build__/compile_commands.json'))]" \
+            | grep '/capy-root/\(src\|include\)/' \
+            | xargs -r clang-tidy-${{ matrix.version }} \
+                -p capy-root/__build__ \
+                --warnings-as-errors='*'
+
       - name: FlameGraph
         uses: alandefreitas/cpp-actions/flamegraph@v1.9.0
         if: matrix.time-trace
         with:
           source-dir: capy-root
-          build-dir: __build__
+          build-dir: capy-root/__build__
           github_token: ${{ secrets.GITHUB_TOKEN }}
 
-      - name: Codecov
-        if: ${{ matrix.coverage }}
+      - name: Generate Coverage Report
+        if: ${{ matrix.coverage && matrix.linux }}
         run: |
           set -x
-
-          # Generate report
           gcov_tool="gcov"
-          if command -v "gcov-${{ steps.setup-cpp.outputs.version-major }}.${{ steps.setup-cpp.outputs.version-minor }}" &> /dev/null; then
-              gcov_tool="gcov"
-          elif command -v "gcov-${{ steps.setup-cpp.outputs.version-major }}" &> /dev/null; then
+          if command -v "gcov-${{ steps.setup-cpp.outputs.version-major }}" &> /dev/null; then
               gcov_tool="gcov-${{ steps.setup-cpp.outputs.version-major }}"
           fi
-          lcov -c -q -o "capy-root/__build__/coverage.info" -d "capy-root/__build__" --include "$(pwd)/capy-root/*" --gcov-tool "$gcov_tool"
+          lcov -c -q -o "capy-root/__build__/coverage.info" -d "capy-root/__build__" \
+            --include "$(pwd)/capy-root/include/*" \
+            --include "$(pwd)/capy-root/src/*" \
+            --gcov-tool "$gcov_tool"
 
-          # Upload to codecov
-          bash <(curl -s https://codecov.io/bash) -f "capy-root/__build__/coverage.info"
-
-          # Summary
+      - name: Generate Coverage Report (macOS)
+        if: ${{ matrix.coverage && matrix.macos }}
+        run: |
+          pip3 install --break-system-packages gcovr
+          gcovr \
+            --gcov-executable "xcrun llvm-cov gcov" \
+            -r capy-root \
+            --filter ".*/capy-root/include/.*" \
+            --filter ".*/capy-root/src/.*" \
+            --lcov -o "capy-root/__build__/coverage.info"
+
+      - name: Generate Coverage Report (Windows)
+        if: ${{ matrix.coverage && matrix.windows }}
+        run: |
+          pip3 install gcovr
+          gcovr \
+            -r capy-root \
+            --filter ".*/capy-root/include/.*" \
+            --filter ".*/capy-root/src/.*" \
+            --lcov -o "capy-root/__build__/coverage.info"
+
+      - name: Upload to Codecov
+        if: ${{ matrix.coverage }}
+        uses: codecov/codecov-action@v5
+        with:
+          files: capy-root/__build__/coverage.info
+          flags: ${{ matrix.coverage-flag }}
+          token: ${{ secrets.CODECOV_TOKEN }}
+          fail_ci_if_error: false
+          verbose: true
+          disable_search: true
+          plugins: noop
+
+      - name: Coverage Summary
+        if: ${{ matrix.coverage }}
+        run: |
           echo "# Coverage" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "[![codecov](https://codecov.io/github/$GITHUB_REPOSITORY/commit/$GITHUB_SHA/graphs/sunburst.svg)](https://codecov.io/github/$GITHUB_REPOSITORY/commit/$GITHUB_SHA)" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
-          echo "" >> $GITHUB_STEP_SUMMARY
           echo "Commit: [![codecov](https://codecov.io/github/$GITHUB_REPOSITORY/commit/$GITHUB_SHA/graph/badge.svg)](https://codecov.io/github/$GITHUB_REPOSITORY/commit/$GITHUB_SHA)" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
           echo "Branch: [![codecov](https://codecov.io/github/$GITHUB_REPOSITORY/branch/$GITHUB_REF_NAME/graph/badge.svg)](https://codecov.io/github/$GITHUB_REPOSITORY/commit/$GITHUB_SHA)" >> $GITHUB_STEP_SUMMARY
           echo "" >> $GITHUB_STEP_SUMMARY
 
+  freebsd:
+    defaults:
+      run:
+        shell: bash
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - version: "14.3"
+            name: "FreeBSD 14.3 (Clang): C++20"
+          - version: "15.0"
+            name: "FreeBSD 15.0 (Clang): C++20"
+            build-cmake: true
+    
+    name: ${{ matrix.name }}
+    runs-on: ubuntu-24.04
+    timeout-minutes: 120
+
+    steps:
+      - name: Clone Boost.Capy
+        uses: actions/checkout@v4
+        with:
+          path: capy-root
+
+      - name: Clone Boost
+        uses: alandefreitas/cpp-actions/boost-clone@v1.9.0
+        id: boost-clone
+        with:
+          branch: ${{ (github.ref_name == 'master' && github.ref_name) || 'develop' }}
+          boost-dir: boost-source
+          modules-exclude-paths: ''
+          scan-modules-dir: capy-root
+          scan-modules-ignore: capy
+
+      - name: Patch Boost
+        id: patch
+        run: |
+          set -xe
+          module=${GITHUB_REPOSITORY#*/}
+          echo "module=$module" >> $GITHUB_OUTPUT
+          workspace_root=$(echo "$GITHUB_WORKSPACE" | sed 's/\\/\//g')
+
+          rm -r "boost-source/libs/$module" || true
+          cp -rL boost-source boost-root
+          cd boost-root
+          boost_root="$(pwd)"
+          echo -E "boost_root=$boost_root" >> $GITHUB_OUTPUT
+          cp -r "$workspace_root"/capy-root "libs/$module"
+
+      - name: Boost B2 Workflow (FreeBSD)
+        uses: vmactions/freebsd-vm@v1
+        with:
+          release: ${{ matrix.version }}
+          usesh: true
+          run: |
+            set -xe
+            cd boost-root
+            ./bootstrap.sh
+            ./b2 libs/${{ steps.patch.outputs.module }}/test \
+              toolset=clang \
+              cxxstd=20 \
+              variant=release \
+              link=shared \
+              rtti=on \
+              -q \
+              -j$(sysctl -n hw.ncpu)
+
+      - name: Boost CMake Workflow (FreeBSD)
+        if: ${{ matrix.build-cmake }}
+        uses: vmactions/freebsd-vm@v1
+        with:
+          release: ${{ matrix.version }}
+          usesh: true
+          prepare: |
+            pkg install -y cmake
+          run: |
+            set -xe
+            cd boost-root
+            cmake -S . -B build \
+              -DCMAKE_BUILD_TYPE=Release \
+              -DBOOST_INCLUDE_LIBRARIES="${{ steps.patch.outputs.module }}" \
+              -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+            cmake --build build --target tests -j$(sysctl -n hw.ncpu)
+            ctest --test-dir build --output-on-failure
+
   changelog:
     defaults:
       run:
diff --git a/.gitignore b/.gitignore
index a829fe81a..ea42cf23c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 /.vs
+.cache/
 /build-*/
 /build/*
 !/build/Jamfile
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 033c3c2bf..d099480c4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,6 +3,8 @@
 # Copyright (c) 2021 Dmitry Arkhipov (grisumbras@gmail.com)
 # Copyright (c) 2022 Alan de Freitas (alandefreitas@gmail.com)
 # Copyright (c) 2025 Mohammad Nejati
+# Copyright (c) 2026 Steve Gerbino
+# Copyright (c) 2026 Michael Vandeberg
 #
 # Distributed under the Boost Software License, Version 1.0. (See accompanying
 # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,44 +12,80 @@
 # Official repository: https://github.com/cppalliance/capy
 #
 
-#-------------------------------------------------
-#
-# Project
-#
-#-------------------------------------------------
-
-cmake_minimum_required(VERSION 3.8...3.31)
+cmake_minimum_required(VERSION 3.13...3.31)
 
 set(BOOST_CAPY_VERSION 1)
 if (BOOST_SUPERPROJECT_VERSION)
     set(BOOST_CAPY_VERSION ${BOOST_SUPERPROJECT_VERSION})
 endif ()
 project(boost_capy VERSION "${BOOST_CAPY_VERSION}" LANGUAGES CXX)
-set(BOOST_CAPY_IS_ROOT OFF)
-if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     set(BOOST_CAPY_IS_ROOT ON)
-endif ()
+    include(CTest)
+else()
+    set(BOOST_CAPY_IS_ROOT OFF)
+endif()
 set(__ignore__ ${CMAKE_C_COMPILER})
 
-#-------------------------------------------------
-#
-# Options
-#
-#-------------------------------------------------
-if (BOOST_CAPY_IS_ROOT)
-    include(CTest)
-endif ()
+# FreeBSD and macOS ship libc++ without full std::stop_token support;
+# -fexperimental-library enables it and links libc++experimental.
+include(CheckCXXCompilerFlag)
+if((APPLE OR CMAKE_SYSTEM_NAME STREQUAL "FreeBSD") AND
+    CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    check_cxx_compiler_flag(-fexperimental-library
+        BOOST_CAPY_HAS_EXPERIMENTAL_LIBRARY)
+endif()
+
 option(BOOST_CAPY_BUILD_TESTS "Build boost::capy tests" ${BUILD_TESTING})
 option(BOOST_CAPY_BUILD_EXAMPLES "Build boost::capy examples" ${BOOST_CAPY_IS_ROOT})
+option(BOOST_CAPY_BUILD_BENCH "Build boost::capy benchmarks" ${BOOST_CAPY_IS_ROOT})
+option(BOOST_CAPY_BUILD_P2300_EXAMPLES "Build examples that depend on beman-execution (P2300)" OFF)
+option(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES "Build examples that depend on NVIDIA nvexec (CUDA)" OFF)
+option(BOOST_CAPY_BUILD_CUDA_EXAMPLES "Build examples that depend only on CUDA (no stdexec/nvexec)" OFF)
 option(BOOST_CAPY_MRDOCS_BUILD "Build the target for MrDocs: see mrdocs.yml" OFF)
 
-#-------------------------------------------------
-#
-# Library
-#
-#-------------------------------------------------
+if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
+    if(NOT DEFINED CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 23)
+        message(FATAL_ERROR
+            "BOOST_CAPY_BUILD_P2300_EXAMPLES requires CMAKE_CXX_STANDARD >= 23")
+    endif()
+endif()
+
+if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
+    if(NOT BOOST_CAPY_BUILD_STDEXEC_EXAMPLES)
+        message(FATAL_ERROR
+            "BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires "
+            "BOOST_CAPY_BUILD_STDEXEC_EXAMPLES=ON")
+    endif()
+    if(NOT DEFINED CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 23)
+        message(FATAL_ERROR
+            "BOOST_CAPY_BUILD_NVEXEC_EXAMPLES requires CMAKE_CXX_STANDARD >= 23")
+    endif()
+    # Tell NVIDIA/stdexec to build the nvexec target when its
+    # FetchContent is processed (bench/ and/or the example itself).
+    set(STDEXEC_ENABLE_CUDA ON CACHE BOOL
+        "Build nvexec when configuring NVIDIA/stdexec" FORCE)
+endif()
+
+if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
+    if(NOT DEFINED CMAKE_CXX_STANDARD OR CMAKE_CXX_STANDARD LESS 20)
+        message(FATAL_ERROR
+            "BOOST_CAPY_BUILD_CUDA_EXAMPLES requires CMAKE_CXX_STANDARD >= 20")
+    endif()
+endif()
+
+# Enable the CUDA language once for whichever CUDA example set is requested.
+if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES OR BOOST_CAPY_BUILD_CUDA_EXAMPLES)
+    enable_language(CUDA)
+    find_package(CUDAToolkit REQUIRED)
+endif()
+
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
 
+if(BOOST_CAPY_IS_ROOT AND BUILD_SHARED_LIBS)
+    set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
+endif()
+
 file(GLOB_RECURSE BOOST_CAPY_HEADERS CONFIGURE_DEPENDS include/boost/*.hpp include/boost/*.natvis)
 file(GLOB_RECURSE BOOST_CAPY_SOURCES CONFIGURE_DEPENDS src/*.cpp src/*.hpp)
 
@@ -55,17 +93,23 @@ source_group("" FILES "include/boost/capy.hpp" "build/Jamfile")
 source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR}/include/boost/capy PREFIX "include" FILES ${BOOST_CAPY_HEADERS})
 source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR}/src PREFIX "src" FILES ${BOOST_CAPY_SOURCES})
 
+find_package(Threads REQUIRED)
+
 function(boost_capy_setup_properties target)
     target_compile_features(${target} PUBLIC cxx_std_20)
-    target_include_directories(${target} PUBLIC "${PROJECT_SOURCE_DIR}/include")
-    target_include_directories(${target} PRIVATE "${PROJECT_SOURCE_DIR}")
+    target_include_directories(${target} PUBLIC
+        $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}/include>)
+    target_include_directories(${target} PRIVATE
+        $<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>)
     target_compile_definitions(${target} PUBLIC BOOST_CAPY_NO_LIB)
     target_compile_definitions(${target} PRIVATE BOOST_CAPY_SOURCE)
-    if (BUILD_SHARED_LIBS)
-        target_compile_definitions(${target} PUBLIC BOOST_CAPY_DYN_LINK)
-    else ()
-        target_compile_definitions(${target} PUBLIC BOOST_CAPY_STATIC_LINK)
-    endif ()
+    target_link_libraries(${target} PUBLIC Threads::Threads)
+    if(BOOST_CAPY_HAS_EXPERIMENTAL_LIBRARY)
+        target_compile_options(${target} PUBLIC -fexperimental-library)
+        target_link_options(${target} PUBLIC -fexperimental-library)
+    endif()
+    target_compile_definitions(${target} PUBLIC
+        $<IF:$<BOOL:${BUILD_SHARED_LIBS}>,BOOST_CAPY_DYN_LINK,BOOST_CAPY_STATIC_LINK>)
 endfunction()
 
 if (BOOST_CAPY_MRDOCS_BUILD)
@@ -84,32 +128,71 @@ boost_capy_setup_properties(boost_capy)
 
 # Disable IPO/LTCG - causes LNK2016 errors with MSVC
 set_target_properties(boost_capy PROPERTIES
+    EXPORT_NAME capy
     INTERPROCEDURAL_OPTIMIZATION OFF
     INTERPROCEDURAL_OPTIMIZATION_RELEASE OFF
     INTERPROCEDURAL_OPTIMIZATION_RELWITHDEBINFO OFF
     INTERPROCEDURAL_OPTIMIZATION_MINSIZEREL OFF)
 
-#-------------------------------------------------
-#
-# Benchmarks
-#
-#-------------------------------------------------
-add_subdirectory(bench)
+include(GNUInstallDirs)
+
+add_subdirectory(extra/test_suite)
+
+if(BOOST_SUPERPROJECT_VERSION AND NOT CMAKE_VERSION VERSION_LESS 3.13)
+    boost_install(
+        TARGETS boost_capy
+        VERSION ${BOOST_SUPERPROJECT_VERSION}
+        HEADER_DIRECTORY include)
+else()
+    include(CMakePackageConfigHelpers)
+
+    # Set INSTALL_INTERFACE for standalone installs (boost_install handles
+    # this for superproject builds, including versioned-layout paths)
+    target_include_directories(boost_capy PUBLIC
+        $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+
+    set(BOOST_CAPY_INSTALL_CMAKEDIR
+        ${CMAKE_INSTALL_LIBDIR}/cmake/boost_capy)
+
+    install(TARGETS boost_capy boost_capy_test_suite boost_capy_test_suite_main
+        EXPORT boost_capy-targets
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
+    install(DIRECTORY include/
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+    install(FILES extra/test_suite/test_suite.hpp
+        DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/boost/capy/extra/test_suite)
+    install(FILES
+        extra/test_suite/DiscoverTests.cmake
+        extra/test_suite/DiscoverAndWriteTestsScripts.cmake
+        DESTINATION ${BOOST_CAPY_INSTALL_CMAKEDIR})
+    install(EXPORT boost_capy-targets
+        NAMESPACE Boost::
+        DESTINATION ${BOOST_CAPY_INSTALL_CMAKEDIR})
+
+    configure_package_config_file(
+        cmake/boost_capy-config.cmake.in
+        ${CMAKE_CURRENT_BINARY_DIR}/boost_capy-config.cmake
+        INSTALL_DESTINATION ${BOOST_CAPY_INSTALL_CMAKEDIR})
+    write_basic_package_version_file(
+        ${CMAKE_CURRENT_BINARY_DIR}/boost_capy-config-version.cmake
+        COMPATIBILITY SameMajorVersion)
+
+    install(FILES
+        ${CMAKE_CURRENT_BINARY_DIR}/boost_capy-config.cmake
+        ${CMAKE_CURRENT_BINARY_DIR}/boost_capy-config-version.cmake
+        DESTINATION ${BOOST_CAPY_INSTALL_CMAKEDIR})
+endif()
+
+if(BOOST_CAPY_BUILD_BENCH)
+    add_subdirectory(bench)
+endif()
 
-#-------------------------------------------------
-#
-# Examples (before tests so Boost::asio is available)
-#
-#-------------------------------------------------
 if (BOOST_CAPY_BUILD_EXAMPLES)
     add_subdirectory(example)
 endif ()
 
-#-------------------------------------------------
-#
-# Tests
-#
-#-------------------------------------------------
 if (BOOST_CAPY_BUILD_TESTS)
     add_subdirectory(test)
 endif ()
diff --git a/CMakePresets.json b/CMakePresets.json
deleted file mode 100644
index eafecbdf3..000000000
--- a/CMakePresets.json
+++ /dev/null
@@ -1,29 +0,0 @@
-{
-    "version": 6,
-    "cmakeMinimumRequired": {
-        "major": 3,
-        "minor": 20,
-        "patch": 0
-    },
-    "configurePresets": [
-        {
-            "name": "standalone",
-            "displayName": "Standalone Build",
-            "description": "Build capy standalone (no Boost tree required)",
-            "generator": "Ninja",
-            "binaryDir": "${sourceDir}/out/${presetName}",
-            "cacheVariables": {
-                "CMAKE_CXX_STANDARD": "20",
-                "CMAKE_BUILD_TYPE": "Release",
-                "BOOST_CAPY_BUILD_TESTS": "OFF",
-                "BOOST_CAPY_BUILD_EXAMPLES": "OFF"
-            }
-        }
-    ],
-    "buildPresets": [
-        {
-            "name": "standalone",
-            "configurePreset": "standalone"
-        }
-    ]
-}
diff --git a/README.md b/README.md
index caf6ff02e..cffa42774 100644
--- a/README.md
+++ b/README.md
@@ -11,16 +11,30 @@ This library provides facilities which use C++20 coroutines to perform I/O. It i
 
 ## Quick Start
 
-Clone and build with CMake:
+### Standalone build
 
 ```bash
 git clone https://github.com/cppalliance/capy.git
 cd capy
-cmake --preset standalone
-cmake --build --preset standalone
+cmake -B build -G Ninja -DCMAKE_BUILD_TYPE=Release
+cmake --build build
 ```
 
-The library is built to `out/standalone/`.
+### Consume via CMake
+
+Use `FetchContent` or `add_subdirectory` to add capy to your project,
+then link against `Boost::capy`:
+
+```cmake
+include(FetchContent)
+FetchContent_Declare(capy
+    GIT_REPOSITORY https://github.com/cppalliance/capy.git
+    GIT_TAG develop
+    GIT_SHALLOW TRUE)
+FetchContent_MakeAvailable(capy)
+
+target_link_libraries(my_app Boost::capy)
+```
 
 ## Related Libraries
 
diff --git a/bench/CMakeLists.txt b/bench/CMakeLists.txt
index 7eead0ad1..8cfa90eda 100644
--- a/bench/CMakeLists.txt
+++ b/bench/CMakeLists.txt
@@ -7,14 +7,73 @@
 # Official repository: https://github.com/cppalliance/capy
 #
 
-file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp)
-list(APPEND PFILES CMakeLists.txt)
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES
+    CMakeLists.txt bench.cpp allocation.cpp
+    beman/main.cpp beman/beman_env.hpp beman/bench_pool.hpp
+    stdexec/main.cpp
+    stdexec/allocation_tracker.hpp
+    stdexec/awaitable_sender.hpp
+    stdexec/ioaw_io_read_stream.hpp
+    stdexec/ioaw_read_stream.hpp
+    stdexec/ioaw_sync_read_stream.hpp
+    stdexec/sender_awaitable.hpp
+    stdexec/sender_io_env.hpp
+    stdexec/sndr_any_read_stream.hpp
+    stdexec/sndr_io_read_stream.hpp
+    stdexec/sndr_read_stream.hpp
+    stdexec/sndr_sync_read_stream.hpp)
 
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+add_executable(boost_capy_bench bench.cpp)
+target_link_libraries(boost_capy_bench PRIVATE Boost::capy)
+target_include_directories(boost_capy_bench PRIVATE .)
 
-add_executable(boost_capy_bench ${PFILES})
-target_link_libraries(
-    boost_capy_bench PRIVATE
-    Boost::capy)
+add_executable(boost_capy_bench_allocation allocation.cpp)
+target_link_libraries(boost_capy_bench_allocation PRIVATE Boost::capy)
 
-target_include_directories(boost_capy_bench PRIVATE .)
+if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
+    include(FetchContent)
+    FetchContent_Declare(
+        beman-task
+        GIT_REPOSITORY https://github.com/bemanproject/task
+        GIT_TAG 6163df9
+        SYSTEM
+        FIND_PACKAGE_ARGS
+            NAMES beman.task
+    )
+    FetchContent_MakeAvailable(beman-task)
+
+    add_executable(boost_capy_bench_beman beman/main.cpp)
+    target_compile_features(boost_capy_bench_beman PRIVATE cxx_std_23)
+    target_link_libraries(boost_capy_bench_beman PRIVATE
+        Boost::capy beman::task beman::execution_headers)
+endif()
+
+if(BOOST_CAPY_BUILD_STDEXEC_EXAMPLES)
+    include(FetchContent)
+    FetchContent_Declare(
+        stdexec
+        GIT_REPOSITORY https://github.com/NVIDIA/stdexec
+        GIT_TAG 307b83c5689ea7c2e5b31561cdc428697705333e
+        SYSTEM
+        FIND_PACKAGE_ARGS
+            NAMES stdexec
+    )
+    FetchContent_MakeAvailable(stdexec)
+
+    add_executable(boost_capy_bench_stdexec stdexec/main.cpp)
+    target_compile_features(boost_capy_bench_stdexec PRIVATE cxx_std_23)
+    target_link_libraries(boost_capy_bench_stdexec PRIVATE
+        Boost::capy STDEXEC::stdexec)
+endif()
+
+if(BUILD_SHARED_LIBS)
+    include(FetchContent)
+    FetchContent_Declare(mimalloc
+        GIT_REPOSITORY https://github.com/microsoft/mimalloc
+        GIT_TAG v2.2.7
+        GIT_SHALLOW TRUE)
+    set(MI_BUILD_TESTS OFF CACHE BOOL "Disable mimalloc tests" FORCE)
+    FetchContent_MakeAvailable(mimalloc)
+    target_link_libraries(boost_capy_bench_allocation PRIVATE mimalloc-static)
+    target_compile_definitions(boost_capy_bench_allocation PRIVATE BOOST_CAPY_HAS_MIMALLOC=1)
+endif()
diff --git a/example/allocation/allocation.cpp b/bench/allocation.cpp
similarity index 88%
rename from example/allocation/allocation.cpp
rename to bench/allocation.cpp
index b2bcfc360..20ad589cd 100644
--- a/example/allocation/allocation.cpp
+++ b/bench/allocation.cpp
@@ -38,7 +38,7 @@
 # define CAPY_NOINLINE
 #endif
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 std::atomic<std::size_t> counter{0};
 
@@ -82,31 +82,31 @@ class mi_memory_resource
 // business logic awaiting an HTTP client, awaiting
 // a TLS stream, awaiting a tcp_socket
 
-CAPY_NOINLINE task<> depth_4()
+CAPY_NOINLINE capy::task<> depth_4()
 {
     counter.fetch_add(1, std::memory_order_relaxed);
     co_return;
 }
 
-CAPY_NOINLINE task<> depth_3()
+CAPY_NOINLINE capy::task<> depth_3()
 {
     for(int i = 0; i < 3; ++i)
         co_await depth_4();
 }
 
-CAPY_NOINLINE task<> depth_2()
+CAPY_NOINLINE capy::task<> depth_2()
 {
     for(int i = 0; i < 3; ++i)
         co_await depth_3();
 }
 
-CAPY_NOINLINE task<> depth_1()
+CAPY_NOINLINE capy::task<> depth_1()
 {
     for(int i = 0; i < 5; ++i)
         co_await depth_2();
 }
 
-CAPY_NOINLINE task<> bench_loop(std::size_t n)
+CAPY_NOINLINE capy::task<> bench_loop(std::size_t n)
 {
     for(std::size_t i = 0; i < n; ++i)
         co_await depth_1();
@@ -120,9 +120,9 @@ int main()
     counter.store(0);
     auto t0 = std::chrono::steady_clock::now();
     {
-        test::blocking_context ctx;
-        ctx.set_frame_allocator(get_recycling_memory_resource());
-        run_async(ctx.get_executor(),
+        capy::test::blocking_context ctx;
+        ctx.set_frame_allocator(capy::get_recycling_memory_resource());
+        capy::run_async(ctx.get_executor(),
             [&] { ctx.signal_done(); })(
             bench_loop(iterations));
         ctx.run();
@@ -135,9 +135,9 @@ int main()
     mi_memory_resource mi_mr;
     auto t2 = std::chrono::steady_clock::now();
     {
-        test::blocking_context ctx;
+        capy::test::blocking_context ctx;
         ctx.set_frame_allocator(&mi_mr);
-        run_async(ctx.get_executor(),
+        capy::run_async(ctx.get_executor(),
             [&] { ctx.signal_done(); })(
             bench_loop(iterations));
         ctx.run();
@@ -149,8 +149,8 @@ int main()
     counter.store(0);
     auto t4 = std::chrono::steady_clock::now();
     {
-        test::blocking_context ctx;
-        run_async(ctx.get_executor(), std::allocator<std::byte>{},
+        capy::test::blocking_context ctx;
+        capy::run_async(ctx.get_executor(), std::allocator<std::byte>{},
             [&] { ctx.signal_done(); })(
             bench_loop(iterations));
         ctx.run();
diff --git a/bench/beman/allocation_tracker.hpp b/bench/beman/allocation_tracker.hpp
new file mode 100644
index 000000000..e8c43f5d0
--- /dev/null
+++ b/bench/beman/allocation_tracker.hpp
@@ -0,0 +1,71 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_ALLOCATION_TRACKER_HPP
+#define BOOST_CAPY_BENCH_ALLOCATION_TRACKER_HPP
+
+#include <atomic>
+#include <cstddef>
+#include <cstdlib>
+#include <memory_resource>
+#include <new>
+
+static std::atomic<int64_t> g_alloc_count{0};
+
+/// Counts every allocate call, then delegates to upstream.
+class counting_memory_resource
+    : public std::pmr::memory_resource
+{
+    std::pmr::memory_resource* upstream_;
+
+    void* do_allocate(
+        std::size_t n, std::size_t align) override
+    {
+        g_alloc_count.fetch_add(1, std::memory_order_relaxed);
+        return upstream_->allocate(n, align);
+    }
+
+    void do_deallocate(
+        void* p, std::size_t n, std::size_t align) override
+    {
+        upstream_->deallocate(p, n, align);
+    }
+
+    bool do_is_equal(
+        memory_resource const& other) const noexcept override
+    {
+        return this == &other;
+    }
+
+public:
+    explicit counting_memory_resource(
+        std::pmr::memory_resource* upstream) noexcept
+        : upstream_(upstream) {}
+};
+
+void* operator new(std::size_t n)
+{
+    g_alloc_count.fetch_add(1, std::memory_order_relaxed);
+    void* p = std::malloc(n);
+    if (!p)
+        throw std::bad_alloc();
+    return p;
+}
+
+void operator delete(void* p) noexcept
+{
+    std::free(p);
+}
+
+void operator delete(void* p, std::size_t) noexcept
+{
+    std::free(p);
+}
+
+#endif
diff --git a/bench/beman/awaitable_sender.hpp b/bench/beman/awaitable_sender.hpp
new file mode 100644
index 000000000..0e1f3d8cc
--- /dev/null
+++ b/bench/beman/awaitable_sender.hpp
@@ -0,0 +1,558 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_AWAITABLE_SENDER_HPP
+#define BOOST_CAPY_BENCH_AWAITABLE_SENDER_HPP
+
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <beman/execution/execution.hpp>
+
+#include <concepts>
+#include <coroutine>
+#include <exception>
+#include <stop_token>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace boost::capy {
+
+// Query CPO for obtaining a Capy-compatible executor
+// from a P2300 environment. The returned object must
+// satisfy Capy's Executor concept. Environments that
+// host IoAwaitables via the as_sender bridge must
+// answer this query.
+struct get_io_executor_t
+{
+    constexpr bool query(
+        beman::execution::forwarding_query_t const&)
+            const noexcept
+    {
+        return true;
+    }
+
+    template<class Env>
+        requires requires(Env const& env) {
+            env.query(
+                std::declval<get_io_executor_t const&>());
+        }
+    auto operator()(Env const& env) const noexcept
+    {
+        return env.query(*this);
+    }
+};
+
+inline constexpr get_io_executor_t get_io_executor{};
+
+namespace detail {
+
+template<class T, class = void>
+struct has_tuple_protocol : std::false_type {};
+
+template<class T>
+struct has_tuple_protocol<T,
+    std::void_t<
+        typename std::tuple_size<T>::type,
+        typename std::tuple_element<0, T>::type>>
+    : std::true_type {};
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_ec_outcome : std::is_same<T, std::error_code> {};
+
+template<class T>
+struct is_ec_outcome<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> == 1 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_ec_outcome_v =
+    std::is_same_v<T, std::error_code> ||
+    is_ec_outcome<T>::value;
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_compound_ec_result : std::false_type {};
+
+template<class T>
+struct is_compound_ec_result<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> >= 2 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_compound_ec_result_v =
+    is_compound_ec_result<T>::value;
+
+struct frame_cb
+{
+    void (*resume)(frame_cb*);
+    void (*destroy)(frame_cb*);
+    void* data;
+};
+
+} // namespace detail
+
+/** Sender that wraps an IoAwaitable.
+
+    When connected or co_awaited, the bridge queries
+    the receiver's or promise's environment for a
+    Capy-compatible executor via get_io_executor.
+    The executor is stored by value in the operation
+    state and used to construct the io_env passed to
+    the IoAwaitable's await_suspend.
+
+    @tparam IoAw The IoAwaitable type.
+*/
+template<class IoAw>
+struct awaitable_sender
+{
+    using sender_concept = beman::execution::sender_t;
+
+    using result_type = decltype(
+        std::declval<std::decay_t<IoAw>&>().await_resume());
+
+    static auto make_sigs()
+    {
+        if constexpr (std::is_void_v<result_type>)
+            return beman::execution::completion_signatures<
+                beman::execution::set_value_t(),
+                beman::execution::set_error_t(std::exception_ptr),
+                beman::execution::set_stopped_t()>{};
+        else if constexpr (
+            detail::is_compound_ec_result_v<result_type>)
+            return beman::execution::completion_signatures<
+                beman::execution::set_value_t(
+                    std::tuple_element_t<1, result_type>),
+                beman::execution::set_error_t(std::error_code),
+                beman::execution::set_error_t(std::exception_ptr),
+                beman::execution::set_stopped_t()>{};
+        else if constexpr (
+            detail::is_ec_outcome_v<result_type>)
+            return beman::execution::completion_signatures<
+                beman::execution::set_value_t(),
+                beman::execution::set_error_t(std::error_code),
+                beman::execution::set_error_t(std::exception_ptr),
+                beman::execution::set_stopped_t()>{};
+        else
+            return beman::execution::completion_signatures<
+                beman::execution::set_value_t(result_type),
+                beman::execution::set_error_t(std::exception_ptr),
+                beman::execution::set_stopped_t()>{};
+    }
+
+    using completion_signatures = decltype(make_sigs());
+
+    IoAw aw_;
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            beman::execution::operation_state_t;
+
+        using executor_type = decltype(
+            beman::execution::get_scheduler(
+                beman::execution::get_env(
+                    std::declval<Receiver const&>()))
+                        .query(get_io_executor_t{}));
+
+        IoAw aw_;
+        Receiver rcvr_;
+        executor_type ex_;
+        io_env env_;
+        detail::frame_cb cb_;
+
+        op_state(IoAw aw, Receiver rcvr)
+            : aw_(std::move(aw))
+            , rcvr_(std::move(rcvr))
+            , ex_{}
+            , cb_{}
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        static void
+        on_resume(detail::frame_cb* p) noexcept
+        {
+            auto* self = static_cast<op_state*>(p->data);
+            self->complete();
+        }
+
+        static void
+        on_destroy(detail::frame_cb*) noexcept
+        {
+        }
+
+        void complete() noexcept
+        {
+            try
+            {
+                if constexpr (std::is_void_v<result_type>)
+                {
+                    aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        beman::execution::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        beman::execution::set_value(
+                            std::move(rcvr_));
+                }
+                else if constexpr (
+                    detail::is_compound_ec_result_v<result_type>)
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                    {
+                        beman::execution::set_stopped(
+                            std::move(rcvr_));
+                    }
+                    else
+                    {
+                        auto ec = get<0>(result);
+                        if(!ec)
+                            beman::execution::set_value(
+                                std::move(rcvr_),
+                                get<1>(std::move(result)));
+                        else
+                            beman::execution::set_error(
+                                std::move(rcvr_), ec);
+                    }
+                }
+                else if constexpr (
+                    detail::is_ec_outcome_v<result_type>)
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                    {
+                        beman::execution::set_stopped(
+                            std::move(rcvr_));
+                    }
+                    else
+                    {
+                        std::error_code ec;
+                        if constexpr (std::is_same_v<
+                            result_type, std::error_code>)
+                            ec = result;
+                        else
+                            ec = get<0>(result);
+                        if(!ec)
+                            beman::execution::set_value(
+                                std::move(rcvr_));
+                        else
+                            beman::execution::set_error(
+                                std::move(rcvr_), ec);
+                    }
+                }
+                else
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        beman::execution::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        beman::execution::set_value(
+                            std::move(rcvr_),
+                            std::move(result));
+                }
+            }
+            catch(...)
+            {
+                beman::execution::set_error(
+                    std::move(rcvr_),
+                    std::current_exception());
+            }
+        }
+
+        void start() noexcept
+        {
+            auto renv = beman::execution::get_env(rcvr_);
+            ex_ = beman::execution::get_scheduler(renv)
+                .query(get_io_executor_t{});
+
+            std::stop_token st;
+            if constexpr (requires {
+                { renv.query(beman::execution::get_stop_token_t{}) }
+                    -> std::convertible_to<std::stop_token>; })
+            {
+                st = renv.query(
+                    beman::execution::get_stop_token_t{});
+            }
+
+            env_ = io_env{ex_, st, nullptr};
+
+            if(aw_.await_ready())
+            {
+                complete();
+                return;
+            }
+
+            cb_.resume = &on_resume;
+            cb_.destroy = &on_destroy;
+            cb_.data = this;
+
+            auto h = std::coroutine_handle<>::from_address(
+                static_cast<void*>(&cb_));
+
+            auto resumed = detail::call_await_suspend(
+                &aw_, h, &env_);
+            if(resumed == h)
+                complete();
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(aw_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(aw_, std::move(rcvr));
+    }
+
+    // Bypass beman's sender_awaitable when co_awaited
+    // from a bex::task. Adapts the IoAwaitable's 2-arg
+    // await_suspend to standard 1-arg protocol, avoiding
+    // the double bridge (as_sender + sender_awaitable).
+    template<class Promise>
+    auto as_awaitable(Promise& promise) &&
+    {
+        auto penv = promise.get_env();
+        auto sched = beman::execution::get_scheduler(penv);
+
+        using executor_type = decltype(
+            sched.query(get_io_executor_t{}));
+
+        auto ex = sched.query(get_io_executor_t{});
+
+        std::stop_token st;
+        if constexpr (requires {
+            { penv.query(beman::execution::get_stop_token_t{}) }
+                -> std::convertible_to<std::stop_token>; })
+        {
+            st = penv.query(
+                beman::execution::get_stop_token_t{});
+        }
+
+        struct aw
+        {
+            IoAw aw_;
+            executor_type ex_;
+            std::stop_token st_;
+            io_env env_;
+
+            bool await_ready() noexcept
+            {
+                return aw_.await_ready();
+            }
+
+            std::coroutine_handle<>
+            await_suspend(std::coroutine_handle<> h)
+            {
+                env_ = io_env{ex_, st_, nullptr};
+                return aw_.await_suspend(h, &env_);
+            }
+
+            auto await_resume()
+            {
+                return aw_.await_resume();
+            }
+        };
+
+        return aw{std::move(aw_), std::move(ex), st, {}};
+    }
+};
+
+/** Create a beman::execution sender from an IoAwaitable.
+
+    The bridge routes the awaitable's result through sender
+    channels based on its type:
+
+    - `void` - calls `set_value()`.
+    - `error_code` (or a single-element tuple-like whose
+      element 0 is `error_code`) - calls `set_value()`
+      when the code is zero, `set_error(ec)` otherwise.
+    - Any other single value `T` - calls `set_value(T)`.
+    - Compound results whose element 0 is `error_code`
+      with additional elements are rejected at compile
+      time. Wrap the operation in a `task<error_code>`
+      that inspects the compound result and returns the
+      error code.
+
+    When connected or co_awaited, the bridge queries the
+    receiver's or promise's environment for a Capy executor
+    via get_io_executor. The environment must answer this
+    query with an object satisfying Capy's Executor concept.
+
+    @param aw The IoAwaitable to wrap.
+    @return A sender whose completion channels reflect
+        the awaitable's result type.
+*/
+template<class IoAw>
+auto as_sender(IoAw&& aw)
+{
+    return awaitable_sender<std::decay_t<IoAw>>{
+        std::forward<IoAw>(aw)};
+}
+
+// -------------------------------------------------------
+// split_ec: sender adapter that routes error_code to
+// set_value() or set_error(ec) at runtime.
+// -------------------------------------------------------
+
+namespace detail {
+
+template<class Sender>
+struct split_ec_sender
+{
+    using sender_concept = beman::execution::sender_t;
+
+    using completion_signatures =
+        beman::execution::completion_signatures<
+            beman::execution::set_value_t(),
+            beman::execution::set_error_t(std::error_code),
+            beman::execution::set_error_t(std::exception_ptr),
+            beman::execution::set_stopped_t()>;
+
+    Sender sndr_;
+
+    template<class Receiver>
+    struct ec_receiver
+    {
+        using receiver_concept = beman::execution::receiver_t;
+
+        Receiver rcvr_;
+
+        auto get_env() const noexcept
+        {
+            return beman::execution::get_env(rcvr_);
+        }
+
+        void set_value(std::error_code ec) && noexcept
+        {
+            if (!ec)
+                beman::execution::set_value(
+                    std::move(rcvr_));
+            else
+                beman::execution::set_error(
+                    std::move(rcvr_), ec);
+        }
+
+        void set_value() && noexcept
+        {
+            beman::execution::set_value(
+                std::move(rcvr_));
+        }
+
+        template<class E>
+        void set_error(E&& e) && noexcept
+        {
+            beman::execution::set_error(
+                std::move(rcvr_),
+                std::forward<E>(e));
+        }
+
+        void set_stopped() && noexcept
+        {
+            beman::execution::set_stopped(
+                std::move(rcvr_));
+        }
+    };
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            beman::execution::operation_state_t;
+
+        using inner_op_t = decltype(
+            beman::execution::connect(
+                std::declval<Sender>(),
+                std::declval<ec_receiver<Receiver>>()));
+
+        inner_op_t op_;
+
+        op_state(Sender sndr, Receiver rcvr)
+            : op_(beman::execution::connect(
+                std::move(sndr),
+                ec_receiver<Receiver>{std::move(rcvr)}))
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        void start() noexcept
+        {
+            beman::execution::start(op_);
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(sndr_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            sndr_, std::move(rcvr));
+    }
+};
+
+} // namespace detail
+
+/** Split an `error_code` value channel into success and error channels.
+
+    Takes a sender that completes with `set_value(error_code)` and
+    routes it at runtime: `set_value()` when the code is zero,
+    `set_error(ec)` otherwise. No exceptions.
+
+    @param sndr The predecessor sender.
+    @return A sender completing with `set_value()`,
+        `set_error(error_code)`, or `set_stopped()`.
+*/
+template<class Sender>
+auto split_ec(Sender&& sndr)
+{
+    return detail::split_ec_sender<
+        std::decay_t<Sender>>{
+            std::forward<Sender>(sndr)};
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/bench/beman/ioaw_io_read_stream.hpp b/bench/beman/ioaw_io_read_stream.hpp
new file mode 100644
index 000000000..beb819fa3
--- /dev/null
+++ b/bench/beman/ioaw_io_read_stream.hpp
@@ -0,0 +1,36 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_IOAW_IO_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_IOAW_IO_READ_STREAM_HPP
+
+#include "ioaw_read_stream.hpp"
+
+/// Abstract interface for IoAwaitable read streams.
+struct ioaw_io_read_stream
+{
+    virtual ioaw_read_stream::read_awaitable
+        read_some(boost::capy::mutable_buffer) = 0;
+    virtual ~ioaw_io_read_stream() = default;
+};
+
+/// Concrete implementation of ioaw_io_read_stream wrapping
+/// an ioaw_read_stream.
+struct ioaw_io_read_stream_impl : ioaw_io_read_stream
+{
+    ioaw_read_stream stream_;
+
+    ioaw_read_stream::read_awaitable
+        read_some(boost::capy::mutable_buffer buf) override
+    {
+        return stream_.read_some(buf);
+    }
+};
+
+#endif
diff --git a/bench/beman/ioaw_read_stream.hpp b/bench/beman/ioaw_read_stream.hpp
new file mode 100644
index 000000000..5eb0dd4f8
--- /dev/null
+++ b/bench/beman/ioaw_read_stream.hpp
@@ -0,0 +1,57 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_IOAW_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_IOAW_READ_STREAM_HPP
+
+#include <boost/capy/buffers.hpp>
+#include <boost/capy/concept/read_stream.hpp>
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+#include <coroutine>
+#include <cstddef>
+
+/// No-op ReadStream for benchmarking.
+///
+/// Uses the executor from io_env (passed by capy::task's
+/// transform_awaiter) to post the coroutine back. Satisfies
+/// ReadStream so it can be wrapped by capy::any_read_stream.
+struct ioaw_read_stream
+{
+    struct read_awaitable
+    {
+        boost::capy::continuation cont_{};
+
+        bool await_ready() const noexcept { return false; }
+
+        std::coroutine_handle<>
+        await_suspend(
+            std::coroutine_handle<> h,
+            boost::capy::io_env const* env)
+        {
+            cont_.h = h;
+            env->executor.post(cont_);
+            return std::noop_coroutine();
+        }
+
+        boost::capy::io_result<std::size_t>
+        await_resume() noexcept { return {{}, 0}; }
+    };
+
+    template <boost::capy::MutableBufferSequence MB>
+    read_awaitable read_some(MB)
+    {
+        return {};
+    }
+};
+
+static_assert(boost::capy::ReadStream<ioaw_read_stream>);
+
+#endif
diff --git a/bench/beman/ioaw_sync_read_stream.hpp b/bench/beman/ioaw_sync_read_stream.hpp
new file mode 100644
index 000000000..cdb68c1fc
--- /dev/null
+++ b/bench/beman/ioaw_sync_read_stream.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Synchronous-completion IoAwaitable stream.
+//
+// Every read completes immediately via symmetric
+// transfer — await_suspend returns the coroutine
+// handle, causing an inline resume with no scheduler
+// round-trip.
+//
+
+#ifndef BOOST_CAPY_BENCH_IOAW_SYNC_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_IOAW_SYNC_READ_STREAM_HPP
+
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <coroutine>
+#include <cstddef>
+
+struct ioaw_sync_read_stream
+{
+    struct read_awaitable
+    {
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(
+            std::coroutine_handle<> h,
+            boost::capy::io_env const*)
+        {
+            // Data already buffered — resume inline
+            return h;
+        }
+
+        boost::capy::io_result<std::size_t>
+        await_resume() noexcept
+        {
+            return {{}, 0};
+        }
+    };
+
+    read_awaitable read_some(auto)
+    {
+        return {};
+    }
+};
+
+#endif
diff --git a/bench/beman/main.cpp b/bench/beman/main.cpp
new file mode 100644
index 000000000..60d096f31
--- /dev/null
+++ b/bench/beman/main.cpp
@@ -0,0 +1,793 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// I/O Read Stream Benchmark
+//
+// Compares three execution models across three stream abstraction
+// levels. 20M read_some calls per cell, single thread.
+//
+// Table 1: sender pipeline   (connect/start)
+// Table 2: capy::task        (capy::thread_pool)
+// Table 3: bex::task         (sender_thread_pool)
+//
+// Each table has three rows:
+//   Native      — concrete stream, full visibility
+//   Abstract    — virtual dispatch, implementation hidden
+//   Type erased — value-type erasure
+//
+
+#include "allocation_tracker.hpp"
+#include "awaitable_sender.hpp"
+#include "ioaw_read_stream.hpp"
+#include "ioaw_io_read_stream.hpp"
+#include "repeat_until.hpp"
+#include "sender_awaitable.hpp"
+#include "sndr_any_read_stream.hpp"
+#include "sndr_io_read_stream.hpp"
+#include "sndr_read_stream.hpp"
+#include "sndr_sync_read_stream.hpp"
+#include "ioaw_sync_read_stream.hpp"
+#include "sender_io_env.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/io/any_read_stream.hpp>
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <memory>
+
+namespace bex = beman::execution;
+namespace capy = boost::capy;
+
+static counting_memory_resource g_counting_resource{
+    capy::get_recycling_memory_resource()};
+
+auto get_counting_resource() -> std::pmr::memory_resource*
+{
+    return &g_counting_resource;
+}
+
+// ===================================================================
+// result collection
+// ===================================================================
+
+struct cell_result
+{
+    long long ns = 0;
+    int64_t allocs = 0;
+};
+
+static constexpr int OPS_PER_CELL = 20'000'000;
+static constexpr int OUTER_LOOPS = 2'000;
+static constexpr int INNER_LOOPS = 10'000;
+
+static constexpr int NUM_RUNS    = 5;
+static constexpr int NUM_TABLES  = 3;
+static constexpr int NUM_STREAMS = 4;
+static constexpr int NUM_COLUMNS = 2;
+
+static constexpr int SENDER_RECEIVER = 0;
+static constexpr int CAPY_TASK       = 1;
+static constexpr int BEMAN_TASK      = 2;
+
+static constexpr int NATIVE_STREAM       = 0;
+static constexpr int ABSTRACT_STREAM     = 1;
+static constexpr int TYPE_ERASED_STREAM  = 2;
+static constexpr int SYNC_STREAM         = 3;
+
+static constexpr int NATIVE_EXEC_MODEL = 0;
+static constexpr int BRIDGED_EXEC_MODEL = 1;
+
+
+// ===================================================================
+// Table 1: capy::task
+//
+// Templated session/accept coroutines instantiated with each
+// stream type. The executor comes from io_env via capy::task's
+// transform_awaiter.
+// ===================================================================
+
+template <class Stream>
+capy::task<> capy_session(Stream& stream)
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await stream.read_some(
+            capy::mutable_buffer(buf, sizeof(buf)));
+}
+
+template <class Stream>
+capy::task<> capy_accept(Stream& stream, cell_result& out)
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await capy_session(stream);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+// ===================================================================
+// Table 1: capy::task — Column B (sender via await_sender bridge)
+//
+// The stream returns a sender. capy::task consumes it by wrapping
+// in await_sender which bridges the sender to an IoAwaitable.
+// Single pool: sender_thread_pool with sender_as_capy_executor
+// adapter so capy::task can run on it.
+// ===================================================================
+
+template <class Stream>
+capy::task<> capy_session_sndr(Stream& stream)
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await capy::await_sender(
+            stream.read_some(
+                capy::mutable_buffer(buf, sizeof(buf))));
+}
+
+template <class Stream>
+capy::task<> capy_accept_sndr(Stream& stream, cell_result& out)
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await capy_session_sndr(stream);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+// ===================================================================
+// Table 2: bex::task — Column A (sender, native)
+//
+// Same templated pattern but using bex::task<void, io_env> coroutines on
+// sender_thread_pool.
+// ===================================================================
+
+template <class Stream>
+auto bex_session(
+    Stream& stream,
+    std::allocator_arg_t,
+    std::pmr::polymorphic_allocator<std::byte>) -> bex::task<void, io_env>
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await stream.read_some(
+            capy::mutable_buffer(buf, sizeof(buf)));
+}
+
+template <class Stream>
+auto bex_accept(
+    Stream& stream,
+    cell_result& out,
+    std::allocator_arg_t,
+    std::pmr::polymorphic_allocator<std::byte> alloc) -> bex::task<void, io_env>
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await bex_session(stream, std::allocator_arg, alloc);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+// ===================================================================
+// Table 2: bex::task — Column B (awaitable via as_sender bridge)
+//
+// The stream returns an IoAwaitable. bex::task consumes it by
+// wrapping in as_sender which bridges the awaitable to a sender.
+// ===================================================================
+
+template <class Stream>
+auto bex_session_ioaw(
+    Stream& stream,
+    std::allocator_arg_t,
+    std::pmr::polymorphic_allocator<std::byte>) -> bex::task<void, io_env>
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await capy::as_sender(
+            stream.read_some(
+                capy::mutable_buffer(buf, sizeof(buf))));
+}
+
+template <class Stream>
+auto bex_accept_ioaw(
+    Stream& stream,
+    cell_result& out,
+    std::allocator_arg_t,
+    std::pmr::polymorphic_allocator<std::byte> alloc) -> bex::task<void, io_env>
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await bex_session_ioaw(stream,
+            std::allocator_arg, alloc);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+// ===================================================================
+// main
+// ===================================================================
+
+int main()
+{
+    cell_result grid[NUM_RUNS + 1][NUM_TABLES][NUM_STREAMS][NUM_COLUMNS]{};
+
+    // run 0 is a warmup pass (results discarded),
+    // measured runs are 1..NUM_RUNS
+    for (int run = 0; run <= NUM_RUNS; ++run)
+    {
+
+
+    // ---------------------------------------------------------------
+    // Table 1: sender/receiver pipeline (repeat_until)
+    // ---------------------------------------------------------------
+
+    // Col A: Sender (native)
+
+
+    // Native — sndr_read_stream
+    {
+        sender_thread_pool pool(1);
+        sndr_read_stream stream{&pool};
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf)));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][NATIVE_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Abstract — sndr_io_read_stream
+    {
+        sender_thread_pool pool(1);
+        sndr_io_read_stream_impl stream{&pool};
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return static_cast<sndr_io_read_stream&>(
+                        stream).read_some(
+                            capy::mutable_buffer(buf, sizeof(buf)));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][ABSTRACT_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Type erased — sndr_any_read_stream
+    {
+        sender_thread_pool pool(1);
+        sndr_any_read_stream stream(sndr_read_stream{&pool});
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf)));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][TYPE_ERASED_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Col B: Awaitable (via as_sender bridge)
+
+
+    // Native — ioaw_read_stream
+    {
+        sender_thread_pool pool(1);
+        ioaw_read_stream stream;
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return capy::as_sender(stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf))));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][NATIVE_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Abstract — ioaw_io_read_stream
+    {
+        sender_thread_pool pool(1);
+        ioaw_io_read_stream_impl stream;
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return capy::as_sender(
+                        static_cast<ioaw_io_read_stream&>(
+                            stream).read_some(
+                                capy::mutable_buffer(
+                                    buf, sizeof(buf))));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][ABSTRACT_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Type erased — capy::any_read_stream
+    {
+        sender_thread_pool pool(1);
+        ioaw_read_stream concrete;
+        capy::any_read_stream stream(&concrete);
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return capy::as_sender(stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf))));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][TYPE_ERASED_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+
+    // Synchronous — sndr_sync_read_stream (Col A)
+    {
+        sender_thread_pool pool(1);
+        sndr_sync_read_stream stream;
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf)));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][SYNC_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Synchronous — ioaw_sync_read_stream (Col B)
+    {
+        sender_thread_pool pool(1);
+        ioaw_sync_read_stream stream;
+        auto sched = pool.get_scheduler();
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(
+            std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        bex::sync_wait(bex::starts_on(sched,
+            repeat_until(
+                bex::let_value(bex::just(), [&]() {
+                    return capy::as_sender(stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf))));
+                }),
+                [&count]() { return --count == 0; })));
+        pool.join();
+        auto elapsed =
+            std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(
+            std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][SYNC_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // ---------------------------------------------------------------
+    // Table 2: capy::task (capy::thread_pool)
+    // ---------------------------------------------------------------
+
+
+    // Native — ioaw_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_read_stream stream;
+        capy::run_async(pool.get_executor())(
+            capy_accept(stream, grid[run][CAPY_TASK][NATIVE_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Abstract — ioaw_io_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_io_read_stream_impl stream;
+        capy::run_async(pool.get_executor())(
+            capy_accept(static_cast<ioaw_io_read_stream&>(stream),
+                grid[run][CAPY_TASK][ABSTRACT_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Type erased — capy::any_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_read_stream concrete;
+        capy::any_read_stream stream(&concrete);
+        capy::run_async(pool.get_executor())(
+            capy_accept(stream, grid[run][CAPY_TASK][TYPE_ERASED_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Synchronous — ioaw_sync_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_sync_read_stream stream;
+        capy::run_async(pool.get_executor())(
+            capy_accept(stream, grid[run][CAPY_TASK][SYNC_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Col B: Sender (via await_sender bridge)
+
+    // Native — sndr_read_stream
+    {
+        sender_thread_pool pool(1);
+        sender_as_capy_executor adapter{&pool};
+        sndr_read_stream stream{&pool};
+        capy::run_async(adapter)(
+            capy_accept_sndr(stream, grid[run][CAPY_TASK][NATIVE_STREAM][BRIDGED_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Abstract — sndr_io_read_stream
+    {
+        sender_thread_pool pool(1);
+        sender_as_capy_executor adapter{&pool};
+        sndr_io_read_stream_impl stream{&pool};
+        capy::run_async(adapter)(
+            capy_accept_sndr(
+                static_cast<sndr_io_read_stream&>(stream),
+                grid[run][CAPY_TASK][ABSTRACT_STREAM][BRIDGED_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Type erased — sndr_any_read_stream
+    {
+        sender_thread_pool pool(1);
+        sender_as_capy_executor adapter{&pool};
+        sndr_any_read_stream stream(sndr_read_stream{&pool});
+        capy::run_async(adapter)(
+            capy_accept_sndr(stream, grid[run][CAPY_TASK][TYPE_ERASED_STREAM][BRIDGED_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Synchronous — sndr_sync_read_stream
+    {
+        sender_thread_pool pool(1);
+        sender_as_capy_executor adapter{&pool};
+        sndr_sync_read_stream stream;
+        capy::run_async(adapter)(
+            capy_accept_sndr(stream, grid[run][CAPY_TASK][SYNC_STREAM][BRIDGED_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // ---------------------------------------------------------------
+    // Table 3: beman::execution::task (bex::task<void, io_env>)
+    // ---------------------------------------------------------------
+
+
+    // Native — sndr_read_stream
+    {
+        sender_thread_pool pool(1);
+        sndr_read_stream stream{&pool};
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept(
+                stream, grid[run][BEMAN_TASK][NATIVE_STREAM][NATIVE_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    // Abstract — sndr_io_read_stream
+    {
+        sender_thread_pool pool(1);
+        sndr_io_read_stream_impl stream{&pool};
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept(
+                static_cast<sndr_io_read_stream&>(stream),
+                grid[run][BEMAN_TASK][ABSTRACT_STREAM][NATIVE_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    // Type erased — sndr_any_read_stream
+    {
+        sender_thread_pool pool(1);
+        sndr_any_read_stream stream(sndr_read_stream{&pool});
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept(
+                stream, grid[run][BEMAN_TASK][TYPE_ERASED_STREAM][NATIVE_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    // Synchronous — sndr_sync_read_stream
+    {
+        sender_thread_pool pool(1);
+        sndr_sync_read_stream stream;
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept(
+                stream, grid[run][BEMAN_TASK][SYNC_STREAM][NATIVE_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    // Col B: Awaitable (via as_sender bridge)
+
+    // Native — ioaw_read_stream
+    {
+        sender_thread_pool pool(1);
+        ioaw_read_stream stream;
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept_ioaw(
+                stream, grid[run][BEMAN_TASK][NATIVE_STREAM][BRIDGED_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    // Abstract — ioaw_io_read_stream
+    {
+        sender_thread_pool pool(1);
+        ioaw_io_read_stream_impl stream;
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept_ioaw(
+                static_cast<ioaw_io_read_stream&>(stream),
+                grid[run][BEMAN_TASK][ABSTRACT_STREAM][BRIDGED_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    // Type erased — capy::any_read_stream
+    {
+        sender_thread_pool pool(1);
+        ioaw_read_stream concrete;
+        capy::any_read_stream stream(&concrete);
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept_ioaw(
+                stream, grid[run][BEMAN_TASK][TYPE_ERASED_STREAM][BRIDGED_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    // Synchronous — ioaw_sync_read_stream
+    {
+        sender_thread_pool pool(1);
+        ioaw_sync_read_stream stream;
+        auto sched = pool.get_scheduler();
+        auto* mr = get_counting_resource();
+        bex::sync_wait(bex::starts_on(sched,
+            bex_accept_ioaw(
+                stream, grid[run][BEMAN_TASK][SYNC_STREAM][BRIDGED_EXEC_MODEL],
+                std::allocator_arg,
+                std::pmr::polymorphic_allocator<std::byte>(mr))));
+        pool.join();
+    }
+
+    } // for (run)
+
+    // ---------------------------------------------------------------
+    // Print results
+    // ---------------------------------------------------------------
+
+    constexpr double ops = static_cast<double>(OPS_PER_CELL);
+
+    std::printf(
+        "I/O read stream benchmark: "
+        "%d read_some calls per cell, %d runs\n",
+        OPS_PER_CELL, NUM_RUNS);
+
+    char const* row_labels[] = {
+        "Native", "Abstract", "Type-erased", "Synchronous"};
+
+    auto print_table = [&](
+        char const* title,
+        int table,
+        char const* col_a_label,
+        char const* col_b_label)
+    {
+        std::printf("\n  %s\n", title);
+        std::printf(
+            "  %-18s  %-30s  %-30s\n",
+            "", col_a_label, col_b_label);
+        std::printf(
+            "  %-18s  %-30s  %-30s\n",
+            "------------------",
+            "------------------------------",
+            "------------------------------");
+
+        for (int s = 0; s < NUM_STREAMS; ++s)
+        {
+            double sum[NUM_COLUMNS]{};
+            double sum2[NUM_COLUMNS]{};
+            double al[NUM_COLUMNS]{};
+            for (int c = 0; c < NUM_COLUMNS; ++c)
+            {
+                for (int r = 1; r <= NUM_RUNS; ++r)
+                {
+                    double v = static_cast<double>(
+                        grid[r][table][s][c].ns) / ops;
+                    sum[c] += v;
+                    sum2[c] += v * v;
+                    al[c] += static_cast<double>(
+                        grid[r][table][s][c].allocs);
+                }
+            }
+
+            double mean[NUM_COLUMNS];
+            double sd[NUM_COLUMNS];
+            double mean_al[NUM_COLUMNS];
+            for (int c = 0; c < NUM_COLUMNS; ++c)
+            {
+                mean[c] = sum[c] / NUM_RUNS;
+                double var = sum2[c] / NUM_RUNS -
+                    mean[c] * mean[c];
+                sd[c] = std::sqrt(var > 0 ? var : 0);
+                mean_al[c] = al[c] / (NUM_RUNS * ops);
+            }
+
+            std::printf(
+                "  %-18s"
+                "  %5.1f +/- %3.1f ns/op  %1.0f al/op"
+                "    %5.1f +/- %3.1f ns/op  %1.0f al/op"
+                "\n",
+                row_labels[s],
+                mean[0], sd[0], mean_al[0],
+                mean[1], sd[1], mean_al[1]);
+        }
+    };
+
+    print_table(
+        "sender/receiver pipeline",
+        SENDER_RECEIVER,
+        "A: sender (native)",
+        "B: awaitable (bridge)");
+
+    print_table(
+        "capy::task",
+        CAPY_TASK,
+        "A: awaitable (native)",
+        "B: sender (bridge)");
+
+    print_table(
+        "beman::execution::task",
+        BEMAN_TASK,
+        "A: sender (native)",
+        "B: awaitable (bridge)");
+
+    return 0;
+}
diff --git a/bench/beman/repeat_until.hpp b/bench/beman/repeat_until.hpp
new file mode 100644
index 000000000..4460adf90
--- /dev/null
+++ b/bench/beman/repeat_until.hpp
@@ -0,0 +1,184 @@
+//
+// Adapted from stdexec (Apache-2.0 WITH LLVM-exception)
+// for benchmark use.
+//
+
+#ifndef BOOST_CAPY_BENCH_REPEAT_UNTIL_HPP
+#define BOOST_CAPY_BENCH_REPEAT_UNTIL_HPP
+
+#include <beman/execution/execution.hpp>
+
+#include <optional>
+#include <system_error>
+#include <type_traits>
+#include <utility>
+
+namespace bex = beman::execution;
+
+template <bex::sender Sndr, bex::receiver Rcvr>
+struct repeat_connector
+{
+    decltype(bex::connect(
+        std::declval<Sndr>(),
+        std::declval<Rcvr>())) op;
+
+    repeat_connector(auto sndr, auto rcvr)
+        : op(bex::connect(std::move(sndr), std::move(rcvr)))
+    {}
+
+    auto start() & noexcept -> void { bex::start(op); }
+};
+
+/// Sender algorithm that repeats a child sender until
+/// a predicate returns true. Predicate is called with
+/// no arguments (child values are discarded).
+///
+/// Includes a trampoline that bounds recursion depth
+/// for synchronous completions (max_depth = 19).
+inline constexpr struct repeat_until_t
+{
+    template <bex::sender Child, typename Pred>
+    struct sender
+    {
+        using sender_concept = bex::sender_t;
+        using completion_signatures = bex::completion_signatures<
+            bex::set_value_t(),
+            bex::set_error_t(std::error_code),
+            bex::set_error_t(std::exception_ptr),
+            bex::set_stopped_t()>;
+
+        template <bex::receiver Receiver>
+        struct state
+        {
+            using operation_state_concept =
+                bex::operation_state_t;
+
+            static constexpr std::size_t max_depth = 19;
+
+            struct own_receiver
+            {
+                using receiver_concept = bex::receiver_t;
+                state* s;
+
+                auto get_env() const noexcept
+                {
+                    return bex::get_env(s->receiver);
+                }
+
+                void set_value() && noexcept
+                {
+                    s->next();
+                }
+
+                template <class... Args>
+                void set_value(Args&&...) && noexcept
+                {
+                    s->next();
+                }
+
+                void set_error(
+                    std::exception_ptr e) && noexcept
+                {
+                    bex::set_error(
+                        std::move(s->receiver),
+                        std::move(e));
+                }
+
+                void set_error(
+                    std::error_code e) && noexcept
+                {
+                    bex::set_error(
+                        std::move(s->receiver),
+                        std::move(e));
+                }
+
+                void set_stopped() && noexcept
+                {
+                    bex::set_stopped(
+                        std::move(s->receiver));
+                }
+            };
+
+            std::remove_cvref_t<Child> child;
+            std::remove_cvref_t<Pred> pred;
+            std::remove_cvref_t<Receiver> receiver;
+            std::optional<repeat_connector<
+                std::remove_cvref_t<Child>,
+                own_receiver>> child_op;
+            std::size_t depth_ = 0;
+            bool draining_ = false;
+            bool again_ = false;
+
+            auto start() & noexcept -> void
+            {
+                drain();
+            }
+
+            // Iterative trampoline that bounds stack
+            // depth for synchronous completions
+            auto drain() & noexcept -> void
+            {
+                draining_ = true;
+                do
+                {
+                    again_ = false;
+                    depth_ = 0;
+                    child_op.emplace(
+                        child, own_receiver{this});
+                    child_op->start();
+                }
+                while (again_);
+                draining_ = false;
+            }
+
+            auto next() & noexcept -> void
+            {
+                if (pred())
+                {
+                    bex::set_value(std::move(receiver));
+                    return;
+                }
+
+                if (!draining_)
+                {
+                    // Async completion — enter drain loop
+                    drain();
+                    return;
+                }
+
+                if (++depth_ >= max_depth)
+                {
+                    // Hit depth limit — trampoline
+                    again_ = true;
+                    return;
+                }
+
+                // Within limit — recurse inline
+                child_op.emplace(
+                    child, own_receiver{this});
+                child_op->start();
+            }
+        };
+
+        std::remove_cvref_t<Child> child;
+        std::remove_cvref_t<Pred> pred;
+
+        template <bex::receiver Receiver>
+        auto connect(Receiver&& rcvr) const&
+            -> state<Receiver>
+        {
+            return {child, pred,
+                std::forward<Receiver>(rcvr)};
+        }
+    };
+
+    template <bex::sender Child, typename Pred>
+    auto operator()(Child&& child, Pred&& pred) const
+        -> sender<Child, Pred>
+    {
+        return {std::forward<Child>(child),
+            std::forward<Pred>(pred)};
+    }
+} repeat_until{};
+
+#endif
diff --git a/bench/beman/sender_awaitable.hpp b/bench/beman/sender_awaitable.hpp
new file mode 100644
index 000000000..edcf4a6b3
--- /dev/null
+++ b/bench/beman/sender_awaitable.hpp
@@ -0,0 +1,430 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EXAMPLE_SENDER_AWAITABLE_HPP
+#define BOOST_CAPY_EXAMPLE_SENDER_AWAITABLE_HPP
+
+#include <boost/capy/error.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <beman/execution/execution.hpp>
+
+#include <atomic>
+#include <coroutine>
+#include <exception>
+#include <new>
+#include <stop_token>
+#include <system_error>
+#include <tuple>
+#include <type_traits>
+#include <variant>
+
+namespace boost::capy {
+
+namespace detail {
+
+struct stopped_t {};
+
+struct operation_cancelled {};
+
+struct bridge_env
+{
+    std::stop_token st_;
+
+    auto query(
+        beman::execution::get_stop_token_t const&)
+            const noexcept
+    {
+        return st_;
+    }
+};
+
+template<class Sender>
+using sender_single_value_t =
+    beman::execution::value_types_of_t<
+        Sender,
+        bridge_env,
+        std::tuple,
+        std::type_identity_t>;
+
+// Detect whether a sender can complete with
+// set_error(std::error_code).
+template<class Sender>
+struct has_error_code_completion
+{
+    template<class... Es>
+    struct checker
+    {
+        static constexpr bool value =
+            (std::is_same_v<
+                Es, std::error_code> || ...);
+    };
+
+    static constexpr bool value =
+        beman::execution::error_types_of_t<
+            Sender,
+            bridge_env,
+            checker>::value;
+};
+
+template<class Sender>
+constexpr bool has_error_code_v =
+    has_error_code_completion<Sender>::value;
+
+// Variant when sender can complete with
+// set_error(error_code): separate slot so
+// error_code is not wrapped in exception_ptr.
+template<class ValueTuple>
+using ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::error_code,
+    std::exception_ptr,
+    stopped_t>;
+
+// Variant when sender does not complete with
+// set_error(error_code).
+template<class ValueTuple>
+using no_ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::exception_ptr,
+    stopped_t>;
+
+template<class ValueTuple, bool HasEc>
+using result_variant = std::conditional_t<
+    HasEc,
+    ec_result_variant<ValueTuple>,
+    no_ec_result_variant<ValueTuple>>;
+
+// Bridge receiver that stores the sender's
+// completion result and resumes the coroutine.
+// Uses an atomic flag shared with await_suspend
+// to handle synchronous completion safely:
+// whichever side (set_value or await_suspend)
+// arrives second is responsible for resumption.
+template<class ValueTuple, bool HasEc>
+struct bridge_receiver
+{
+    using receiver_concept =
+        beman::execution::receiver_t;
+
+    result_variant<ValueTuple, HasEc>* result_;
+    std::coroutine_handle<>            cont_;
+    std::stop_token                    st_;
+    std::atomic<bool>*                 done_;
+
+    auto get_env() const noexcept -> bridge_env
+    {
+        return {st_};
+    }
+
+    void resume_if_ready() noexcept
+    {
+        if(done_->exchange(
+            true, std::memory_order_acq_rel))
+            cont_.resume();
+    }
+
+    template<class... Args>
+    void set_value(Args&&... args) && noexcept
+    {
+        result_->template emplace<1>(
+            std::forward<Args>(args)...);
+        resume_if_ready();
+    }
+
+    template<class E>
+    void set_error(E&& e) && noexcept
+    {
+        if constexpr (
+            HasEc &&
+            std::is_same_v<
+                std::decay_t<E>,
+                std::error_code>)
+            result_->template emplace<2>(
+                std::forward<E>(e));
+        else if constexpr (
+            std::is_same_v<
+                std::decay_t<E>,
+                std::exception_ptr>)
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::forward<E>(e));
+        }
+        else
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::make_exception_ptr(
+                    std::forward<E>(e)));
+        }
+        resume_if_ready();
+    }
+
+    void set_stopped() && noexcept
+    {
+        constexpr auto idx = HasEc ? 4 : 3;
+        result_->template emplace<idx>(
+            stopped_t{});
+        resume_if_ready();
+    }
+};
+
+} // namespace detail
+
+/** Awaitable that bridges a beman::execution
+    sender into a Capy coroutine.
+
+    Satisfies IoAwaitable. When co_awaited inside
+    a capy::task, connects the sender to a bridge
+    receiver, starts the operation, and resumes
+    the coroutine when the sender completes.
+
+    Uses an atomic exchange protocol to handle
+    senders that complete synchronously during
+    start(): whichever side arrives second
+    (receiver or await_suspend) resumes the
+    coroutine.
+
+    The bridge inspects the sender's error
+    completion signatures at compile time. If the
+    sender can complete with
+    set_error(std::error_code), await_resume
+    returns io_result so the error code is a
+    value, not an exception. Otherwise
+    await_resume returns the value directly and
+    genuine exceptions are rethrown.
+
+    @tparam Sender The beman::execution sender
+        type.
+*/
+template<class Sender>
+struct [[nodiscard]] sender_awaitable
+{
+    static constexpr bool has_ec =
+        detail::has_error_code_v<Sender>;
+
+    using value_tuple =
+        detail::sender_single_value_t<Sender>;
+    using variant_type =
+        detail::result_variant<
+            value_tuple, has_ec>;
+    using receiver_type =
+        detail::bridge_receiver<
+            value_tuple, has_ec>;
+    using op_state_type = decltype(
+        beman::execution::connect(
+            std::declval<Sender>(),
+            std::declval<receiver_type>()));
+
+    Sender sndr_;
+    variant_type result_{};
+
+    alignas(op_state_type)
+    unsigned char op_buf_[sizeof(op_state_type)];
+    bool op_constructed_ = false;
+    std::atomic<bool> done_{false};
+
+    explicit sender_awaitable(Sender sndr)
+        : sndr_(std::move(sndr))
+    {
+    }
+
+    sender_awaitable(sender_awaitable&& o)
+        noexcept(
+            std::is_nothrow_move_constructible_v<
+                Sender>)
+        : sndr_(std::move(o.sndr_))
+    {
+    }
+
+    sender_awaitable(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable&&) = delete;
+
+    ~sender_awaitable()
+    {
+        if(op_constructed_)
+            std::launder(
+                reinterpret_cast<op_state_type*>(
+                    op_buf_))->~op_state_type();
+    }
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(
+        std::coroutine_handle<> h,
+        io_env const* env)
+    {
+        ::new(op_buf_) op_state_type(
+            beman::execution::connect(
+                std::move(sndr_),
+                receiver_type{
+                    &result_, h,
+                    env->stop_token, &done_}));
+        op_constructed_ = true;
+        beman::execution::start(
+            *std::launder(
+                reinterpret_cast<
+                    op_state_type*>(
+                        op_buf_)));
+
+        // If the sender completed during start(),
+        // the receiver already stored the result.
+        // Return h to resume without suspending.
+        if(done_.exchange(
+            true, std::memory_order_acq_rel))
+            return h;
+        return std::noop_coroutine();
+    }
+
+    auto await_resume()
+    {
+        if constexpr (has_ec)
+            return await_resume_ec();
+        else
+            return await_resume_no_ec();
+    }
+
+private:
+    // Sender can complete with
+    // set_error(error_code). Return io_result
+    // so the error code is a value, not an
+    // exception.
+    auto await_resume_ec()
+    {
+        // exception_ptr at index 3
+        if(result_.index() == 3)
+            std::rethrow_exception(
+                std::get<3>(result_));
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+        {
+            // stopped at index 4
+            if(result_.index() == 4)
+                return io_result<>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<>{
+                    std::get<2>(result_)};
+            return io_result<>{};
+        }
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+        {
+            using T = std::tuple_element_t<
+                0, value_tuple>;
+            if(result_.index() == 4)
+                return io_result<T>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<T>{
+                    std::get<2>(result_)};
+            return io_result<T>{
+                {},
+                std::get<0>(
+                    std::get<1>(
+                        std::move(result_)))};
+        }
+        else
+        {
+            if(result_.index() == 4)
+                return io_result<value_tuple>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<value_tuple>{
+                    std::get<2>(result_)};
+            return io_result<value_tuple>{
+                {},
+                std::get<1>(
+                    std::move(result_))};
+        }
+    }
+
+    // Sender does not complete with
+    // set_error(error_code). Return the value
+    // directly; rethrow exceptions.
+    auto await_resume_no_ec()
+    {
+        // exception_ptr at index 2
+        if(result_.index() == 2)
+            std::rethrow_exception(
+                std::get<2>(result_));
+        // stopped at index 3
+        if(result_.index() == 3)
+            throw detail::operation_cancelled{};
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+            return;
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+            return std::get<0>(
+                std::get<1>(
+                    std::move(result_)));
+        else
+            return std::get<1>(
+                std::move(result_));
+    }
+};
+
+/** Create an IoAwaitable from a
+    beman::execution sender.
+
+    If the sender can complete with
+    set_error(std::error_code), the returned
+    awaitable yields io_result so the error code
+    is a value, not an exception. Otherwise the
+    awaitable yields the value directly.
+
+    @par Example
+    @code
+    capy::task<int> compute(auto sched)
+    {
+        auto result = co_await await_sender(
+            beman::execution::schedule(sched)
+                | beman::execution::then(
+                    [] { return 42; }));
+        co_return result;
+    }
+    @endcode
+
+    @param sndr The sender to bridge.
+    @return An IoAwaitable that can be co_awaited
+        in a capy::task.
+*/
+template<class Sender>
+auto await_sender(Sender&& sndr)
+{
+    return sender_awaitable<
+        std::decay_t<Sender>>(
+            std::forward<Sender>(sndr));
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/bench/beman/sender_io_env.hpp b/bench/beman/sender_io_env.hpp
new file mode 100644
index 000000000..9e0df9e02
--- /dev/null
+++ b/bench/beman/sender_io_env.hpp
@@ -0,0 +1,229 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Beman execution environment for benchmarks.
+//
+// Provides pool_scheduler (the P2300 scheduler for
+// sender_thread_pool), the capy executor adapter, and
+// the io_env for beman::execution::task.
+//
+
+#ifndef BOOST_CAPY_BENCH_SENDER_IO_ENV_HPP
+#define BOOST_CAPY_BENCH_SENDER_IO_ENV_HPP
+
+#include "sender_thread_pool.hpp"
+#include "awaitable_sender.hpp"
+
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/execution_context.hpp>
+
+#include <beman/execution/execution.hpp>
+#include <beman/task/task.hpp>
+
+#include <coroutine>
+#include <memory_resource>
+#include <type_traits>
+#include <utility>
+
+// Adapter making sender_thread_pool satisfy capy's
+// Executor concept so capy::task can run on it.
+struct sender_as_capy_executor
+{
+    sender_thread_pool* pool_;
+
+    boost::capy::execution_context& context() const noexcept
+    {
+        return *pool_;
+    }
+
+    void on_work_started() const noexcept
+    {
+        pool_->on_work_started();
+    }
+
+    void on_work_finished() const noexcept
+    {
+        pool_->on_work_finished();
+    }
+
+    void post(boost::capy::continuation& c) const;
+
+    // Return the handle for symmetric transfer so the
+    // caller resumes the coroutine inline. Posting would
+    // cause a lifetime issue since run_async expects to
+    // hand off ownership via symmetric transfer.
+    std::coroutine_handle<>
+    dispatch(boost::capy::continuation& c) const
+    {
+        return c.h;
+    }
+
+    bool operator==(
+        sender_as_capy_executor const&) const noexcept = default;
+};
+
+namespace ex = beman::execution;
+
+struct pool_scheduler
+{
+    using scheduler_concept = ex::scheduler_t;
+
+    sender_thread_pool* pool_;
+
+    struct env
+    {
+        sender_thread_pool* pool_;
+
+        auto query(
+            ex::get_completion_scheduler_t<ex::set_value_t> const&
+        ) const noexcept
+        {
+            return pool_scheduler{pool_};
+        }
+    };
+
+    template <ex::receiver Receiver>
+    struct op_state : work_item
+    {
+        using operation_state_concept = ex::operation_state_t;
+
+        std::remove_cvref_t<Receiver> rcvr_;
+        sender_thread_pool* pool_;
+
+        op_state(Receiver rcvr, sender_thread_pool* pool)
+            : rcvr_(std::move(rcvr))
+            , pool_(pool)
+        {}
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        void execute() noexcept override
+        {
+            ex::set_value(std::move(rcvr_));
+        }
+
+        void start() & noexcept
+        {
+            pool_->enqueue(this);
+        }
+    };
+
+    struct sender
+    {
+        using sender_concept = ex::sender_t;
+        using completion_signatures =
+            ex::completion_signatures<ex::set_value_t()>;
+
+        sender_thread_pool* pool_;
+
+        auto get_env() const noexcept { return env{pool_}; }
+
+        template <ex::receiver Receiver>
+        auto connect(Receiver&& rcvr)
+            -> op_state<std::remove_cvref_t<Receiver>>
+        {
+            return {std::forward<Receiver>(rcvr), pool_};
+        }
+    };
+
+    auto query(
+        boost::capy::get_io_executor_t const&
+    ) const noexcept -> sender_as_capy_executor
+    {
+        return sender_as_capy_executor{pool_};
+    }
+
+    auto schedule() -> sender { return {pool_}; }
+    bool operator==(pool_scheduler const&) const = default;
+};
+
+inline pool_scheduler
+sender_thread_pool::get_scheduler() noexcept
+{
+    return pool_scheduler{this};
+}
+
+// P2300 has no post(coroutine_handle<>). To resume a
+// coroutine on a scheduler you must go through
+// schedule → connect → start. The operation state
+// must be heap-allocated because the coroutine is
+// suspended and cannot host it.
+struct scheduled_resume
+{
+    struct receiver
+    {
+        using receiver_concept = ex::receiver_t;
+
+        scheduled_resume* self_;
+
+        void set_value() && noexcept
+        {
+            auto h = self_->h_;
+            delete self_;
+            h.resume();
+        }
+
+        void set_error(auto&&) && noexcept
+        {
+            std::terminate();
+        }
+
+        void set_stopped() && noexcept
+        {
+            std::terminate();
+        }
+    };
+
+    using op_state_t =
+        pool_scheduler::op_state<receiver>;
+
+    std::coroutine_handle<> h_;
+    op_state_t op_;
+
+    scheduled_resume(
+        pool_scheduler sched,
+        std::coroutine_handle<> h)
+        : h_(h)
+        , op_(ex::connect(
+            sched.schedule(),
+            receiver{this}))
+    {}
+};
+
+inline void sender_as_capy_executor::post(
+    boost::capy::continuation& c) const
+{
+    auto* p = new scheduled_resume(
+        pool_scheduler{pool_}, c.h);
+    ex::start(p->op_);
+}
+
+struct io_env
+{
+    using scheduler_type = pool_scheduler;
+    using allocator_type = std::pmr::polymorphic_allocator<std::byte>;
+
+    sender_thread_pool* pool_ = nullptr;
+
+    io_env() = default;
+
+    template <typename Env>
+        requires requires(Env const& e) {
+            pool_scheduler{ex::get_scheduler(e)};
+        }
+    io_env(Env const& e)
+        : pool_(pool_scheduler{ex::get_scheduler(e)}.pool_)
+    {}
+};
+
+#endif
diff --git a/bench/beman/sender_thread_pool.hpp b/bench/beman/sender_thread_pool.hpp
new file mode 100644
index 000000000..fc3f8f849
--- /dev/null
+++ b/bench/beman/sender_thread_pool.hpp
@@ -0,0 +1,162 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Minimal thread pool for sender benchmarks.
+//
+// sender_thread_pool is the execution context.
+// pool_scheduler (defined in sender_io_env.hpp)
+// is the P2300 scheduler handle.
+//
+
+#ifndef BOOST_CAPY_BENCH_SENDER_THREAD_POOL_HPP
+#define BOOST_CAPY_BENCH_SENDER_THREAD_POOL_HPP
+
+#include "thread_pool.hpp"
+
+#include <boost/capy/ex/execution_context.hpp>
+
+#include <atomic>
+#include <condition_variable>
+#include <cstddef>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+struct pool_scheduler;
+
+class sender_thread_pool
+    : public boost::capy::execution_context
+{
+    std::mutex mutex_;
+    std::condition_variable work_cv_;
+    std::condition_variable done_cv_;
+    intrusive_queue<work_item> q_;
+    std::vector<std::thread> threads_;
+    std::atomic<std::size_t> outstanding_work_{0};
+    bool stop_{false};
+    bool joined_{false};
+    std::size_t num_threads_;
+    std::once_flag start_flag_;
+
+    void ensure_started()
+    {
+        std::call_once(start_flag_, [this] {
+            threads_.reserve(num_threads_);
+            for (std::size_t i = 0; i < num_threads_; ++i)
+                threads_.emplace_back([this] { run(); });
+        });
+    }
+
+    void run()
+    {
+        for (;;)
+        {
+            work_item* w = nullptr;
+            {
+                std::unique_lock lock(mutex_);
+                work_cv_.wait(lock, [this] {
+                    return !q_.empty() || stop_;
+                });
+                if (stop_)
+                    return;
+                w = q_.pop();
+            }
+            if (w)
+                w->execute();
+        }
+    }
+
+public:
+    explicit sender_thread_pool(std::size_t num_threads = 0)
+        : execution_context(this)
+        , num_threads_(num_threads == 0
+            ? (std::max)(std::thread::hardware_concurrency(), 1u)
+            : num_threads)
+    {}
+
+    ~sender_thread_pool()
+    {
+        stop();
+        join();
+        shutdown();
+        destroy();
+    }
+
+    sender_thread_pool(sender_thread_pool const&) = delete;
+    sender_thread_pool& operator=(sender_thread_pool const&) = delete;
+
+    // Defined in sender_io_env.hpp after pool_scheduler
+    pool_scheduler get_scheduler() noexcept;
+
+    void enqueue(work_item* w)
+    {
+        ensure_started();
+        {
+            std::lock_guard lock(mutex_);
+            q_.push(w);
+        }
+        work_cv_.notify_one();
+    }
+
+    void on_work_started() noexcept
+    {
+        outstanding_work_.fetch_add(1, std::memory_order_acq_rel);
+    }
+
+    void on_work_finished() noexcept
+    {
+        if (outstanding_work_.fetch_sub(
+            1, std::memory_order_acq_rel) == 1)
+        {
+            std::lock_guard lock(mutex_);
+            if (joined_ && !stop_)
+                stop_ = true;
+            done_cv_.notify_all();
+            work_cv_.notify_all();
+        }
+    }
+
+    void join() noexcept
+    {
+        {
+            std::unique_lock lock(mutex_);
+            if (joined_)
+                return;
+            joined_ = true;
+
+            if (outstanding_work_.load(
+                std::memory_order_acquire) == 0)
+            {
+                stop_ = true;
+                work_cv_.notify_all();
+            }
+            else
+            {
+                done_cv_.wait(lock, [this] { return stop_; });
+            }
+        }
+
+        for (auto& t : threads_)
+            if (t.joinable())
+                t.join();
+    }
+
+    void stop() noexcept
+    {
+        {
+            std::lock_guard lock(mutex_);
+            stop_ = true;
+        }
+        work_cv_.notify_all();
+        done_cv_.notify_all();
+    }
+};
+
+#endif
diff --git a/bench/beman/sndr_any_read_sender.hpp b/bench/beman/sndr_any_read_sender.hpp
new file mode 100644
index 000000000..9063b10cf
--- /dev/null
+++ b/bench/beman/sndr_any_read_sender.hpp
@@ -0,0 +1,267 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Type-erased sender for benchmarks.
+//
+// sndr_any_read_sender wraps a concrete sender behind a virtual
+// interface. connect() heap-allocates the operation state because
+// its type is erased.
+//
+
+#ifndef BOOST_CAPY_BENCH_SNDR_ANY_READ_SENDER_HPP
+#define BOOST_CAPY_BENCH_SNDR_ANY_READ_SENDER_HPP
+
+#include <beman/execution/execution.hpp>
+
+#include <coroutine>
+#include <cstddef>
+#include <cstring>
+#include <memory>
+#include <memory_resource>
+#include <utility>
+
+auto get_counting_resource() -> std::pmr::memory_resource*;
+
+namespace ex = beman::execution;
+
+class sndr_any_read_sender
+{
+public:
+    struct op_base
+    {
+        static void* operator new(std::size_t n)
+        {
+            return get_counting_resource()
+                ->allocate(n, alignof(std::max_align_t));
+        }
+
+        static void operator delete(void* p, std::size_t n) noexcept
+        {
+            get_counting_resource()
+                ->deallocate(p, n, alignof(std::max_align_t));
+        }
+
+        virtual void start() noexcept = 0;
+        virtual ~op_base() = default;
+    };
+
+private:
+    struct callback_receiver
+    {
+        using receiver_concept = ex::receiver_t;
+
+        void* data_;
+        void (*on_value_)(void*, std::size_t) noexcept;
+        void (*on_stopped_)(void*) noexcept;
+
+        struct env_t {};
+        auto get_env() const noexcept -> env_t { return {}; }
+
+        void set_value(std::size_t n) && noexcept
+        {
+            on_value_(data_, n);
+        }
+
+        void set_stopped() && noexcept
+        {
+            on_stopped_(data_);
+        }
+
+        template <class E>
+        void set_error(E&&) && noexcept
+        {
+            std::terminate();
+        }
+    };
+
+    using factory_fn = std::unique_ptr<op_base>(*)(
+        void* sender_buf, callback_receiver cr);
+    using destroy_fn = void(*)(void* sender_buf) noexcept;
+
+    static constexpr std::size_t buf_size = 64;
+    alignas(std::max_align_t) char buf_[buf_size];
+    factory_fn factory_;
+    destroy_fn destroy_;
+
+public:
+    using sender_concept = ex::sender_t;
+    using completion_signatures =
+        ex::completion_signatures<ex::set_value_t(std::size_t)>;
+
+    template <class Sender>
+    explicit sndr_any_read_sender(Sender s)
+    {
+        static_assert(sizeof(Sender) <= buf_size);
+        static_assert(alignof(Sender) <= alignof(std::max_align_t));
+        new (buf_) Sender(std::move(s));
+
+        factory_ = +[](void* stor,
+            callback_receiver r) -> std::unique_ptr<op_base>
+        {
+            auto& sndr = *static_cast<Sender*>(stor);
+
+            using inner_op_t = decltype(ex::connect(
+                std::declval<Sender>(),
+                std::declval<callback_receiver>()));
+
+            struct concrete_op : op_base
+            {
+                inner_op_t inner_;
+                concrete_op(Sender s, callback_receiver r)
+                    : inner_(ex::connect(
+                        std::move(s), std::move(r))) {}
+                void start() noexcept override
+                {
+                    ex::start(inner_);
+                }
+            };
+
+            return std::make_unique<concrete_op>(
+                std::move(sndr), std::move(r));
+        };
+
+        destroy_ = +[](void* stor) noexcept {
+            static_cast<Sender*>(stor)->~Sender();
+        };
+    }
+
+    ~sndr_any_read_sender() { destroy_(buf_); }
+
+    sndr_any_read_sender(sndr_any_read_sender const&) = delete;
+    sndr_any_read_sender& operator=(sndr_any_read_sender const&) = delete;
+
+    sndr_any_read_sender(sndr_any_read_sender&& o) noexcept
+        : factory_(o.factory_), destroy_(o.destroy_)
+    {
+        std::memcpy(buf_, o.buf_, buf_size);
+        o.destroy_ = +[](void*) noexcept {};
+    }
+
+    sndr_any_read_sender& operator=(sndr_any_read_sender&&) = delete;
+
+    /// Connect a callback receiver for sender/receiver pipeline use.
+    auto connect(
+        void* data,
+        void (*on_value)(void*, std::size_t) noexcept,
+        void (*on_stopped)(void*) noexcept)
+        -> std::unique_ptr<op_base>
+    {
+        return factory_(buf_,
+            callback_receiver{data, on_value, on_stopped});
+    }
+
+    /// Standard connect for ex::connect CPO. Defers the factory
+    /// call to start() so the callback points to the final address.
+    template <ex::receiver Receiver>
+    struct bridge_op
+    {
+        using operation_state_concept = ex::operation_state_t;
+
+        std::remove_cvref_t<Receiver> rcvr_;
+        factory_fn factory_;
+        destroy_fn destroy_;
+        alignas(std::max_align_t) char sbuf_[buf_size];
+        std::unique_ptr<op_base> inner_;
+
+        bridge_op(Receiver rcvr, sndr_any_read_sender&& sndr)
+            : rcvr_(std::move(rcvr))
+            , factory_(sndr.factory_)
+            , destroy_(sndr.destroy_)
+        {
+            std::memcpy(sbuf_, sndr.buf_, buf_size);
+            sndr.destroy_ = +[](void*) noexcept {};
+        }
+
+        ~bridge_op() { destroy_(sbuf_); }
+
+        bridge_op(bridge_op const&) = delete;
+        bridge_op(bridge_op&&) = delete;
+        bridge_op& operator=(bridge_op const&) = delete;
+        bridge_op& operator=(bridge_op&&) = delete;
+
+        void start() & noexcept
+        {
+            inner_ = factory_(sbuf_, callback_receiver{
+                this,
+                +[](void* p, std::size_t n) noexcept {
+                    auto* self = static_cast<bridge_op*>(p);
+                    ex::set_value(std::move(self->rcvr_), n);
+                },
+                +[](void* p) noexcept {
+                    auto* self = static_cast<bridge_op*>(p);
+                    ex::set_stopped(std::move(self->rcvr_));
+                }
+            });
+            inner_->start();
+        }
+    };
+
+    template <ex::receiver Receiver>
+    auto connect(Receiver&& rcvr) &&
+        -> bridge_op<std::remove_cvref_t<Receiver>>
+    {
+        return {std::forward<Receiver>(rcvr), std::move(*this)};
+    }
+
+    template <typename Promise>
+    auto as_awaitable(Promise&)
+    {
+        struct aw
+        {
+            alignas(std::max_align_t) char buf_[buf_size];
+            factory_fn factory_;
+            destroy_fn destroy_;
+            std::unique_ptr<op_base> inner_;
+            std::coroutine_handle<> cont_{};
+            std::size_t result_{};
+
+            explicit aw(sndr_any_read_sender& sndr)
+                : factory_(sndr.factory_)
+                , destroy_(sndr.destroy_)
+            {
+                std::memcpy(buf_, sndr.buf_, buf_size);
+                sndr.destroy_ = +[](void*) noexcept {};
+            }
+
+            ~aw() { destroy_(buf_); }
+
+            aw(aw const&) = delete;
+            aw(aw&&) = delete;
+            aw& operator=(aw const&) = delete;
+            aw& operator=(aw&&) = delete;
+
+            bool await_ready() const noexcept { return false; }
+
+            void await_suspend(
+                std::coroutine_handle<> h) noexcept
+            {
+                cont_ = h;
+                inner_ = factory_(buf_, callback_receiver{
+                    this,
+                    +[](void* p, std::size_t n) noexcept {
+                        auto* a = static_cast<aw*>(p);
+                        a->result_ = n;
+                        a->cont_.resume();
+                    },
+                    +[](void* p) noexcept {
+                        auto* a = static_cast<aw*>(p);
+                        a->cont_.resume();
+                    }
+                });
+                inner_->start();
+            }
+
+            std::size_t await_resume() noexcept { return result_; }
+        };
+        return aw{*this};
+    }
+};
+
+#endif
diff --git a/bench/beman/sndr_any_read_stream.hpp b/bench/beman/sndr_any_read_stream.hpp
new file mode 100644
index 000000000..fb021ffda
--- /dev/null
+++ b/bench/beman/sndr_any_read_stream.hpp
@@ -0,0 +1,68 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_SNDR_ANY_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_SNDR_ANY_READ_STREAM_HPP
+
+#include "sndr_any_read_sender.hpp"
+
+#include <boost/capy/buffers.hpp>
+
+#include <utility>
+
+/// Standalone value-type erased sender stream.
+///
+/// Mirrors capy::any_read_stream: stores any sender stream behind
+/// a vtable, heap-allocated. Does NOT inherit from
+/// sndr_io_read_stream — this is a fully independent erasure
+/// mechanism.
+class sndr_any_read_stream
+{
+    using read_some_fn = sndr_any_read_sender(*)(
+        void*, boost::capy::mutable_buffer);
+    using destroy_fn = void(*)(void*) noexcept;
+
+    void* stream_;
+    read_some_fn read_some_;
+    destroy_fn destroy_;
+
+public:
+    template <class Stream>
+    explicit sndr_any_read_stream(Stream s)
+    {
+        stream_ = new Stream(std::move(s));
+
+        read_some_ = +[](void* stor,
+            boost::capy::mutable_buffer buf)
+            -> sndr_any_read_sender
+        {
+            auto& stream = *static_cast<Stream*>(stor);
+            return sndr_any_read_sender{stream.read_some(buf)};
+        };
+
+        destroy_ = +[](void* stor) noexcept {
+            delete static_cast<Stream*>(stor);
+        };
+    }
+
+    ~sndr_any_read_stream() { destroy_(stream_); }
+
+    sndr_any_read_stream(sndr_any_read_stream const&) = delete;
+    sndr_any_read_stream& operator=(sndr_any_read_stream const&) = delete;
+    sndr_any_read_stream(sndr_any_read_stream&&) = delete;
+    sndr_any_read_stream& operator=(sndr_any_read_stream&&) = delete;
+
+    sndr_any_read_sender
+        read_some(boost::capy::mutable_buffer buf)
+    {
+        return read_some_(stream_, buf);
+    }
+};
+
+#endif
diff --git a/bench/beman/sndr_io_read_stream.hpp b/bench/beman/sndr_io_read_stream.hpp
new file mode 100644
index 000000000..002a6522c
--- /dev/null
+++ b/bench/beman/sndr_io_read_stream.hpp
@@ -0,0 +1,41 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_SNDR_IO_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_SNDR_IO_READ_STREAM_HPP
+
+#include "sndr_any_read_sender.hpp"
+#include "sndr_read_stream.hpp"
+
+#include <boost/capy/buffers.hpp>
+
+/// Abstract interface for sender-based read streams.
+struct sndr_io_read_stream
+{
+    virtual sndr_any_read_sender
+        read_some(boost::capy::mutable_buffer) = 0;
+    virtual ~sndr_io_read_stream() = default;
+};
+
+/// Concrete implementation wrapping sndr_read_stream.
+struct sndr_io_read_stream_impl : sndr_io_read_stream
+{
+    sndr_read_stream stream_;
+
+    explicit sndr_io_read_stream_impl(sender_thread_pool* pool)
+        : stream_{pool} {}
+
+    sndr_any_read_sender
+        read_some(boost::capy::mutable_buffer buf) override
+    {
+        return sndr_any_read_sender{stream_.read_some(buf)};
+    }
+};
+
+#endif
diff --git a/bench/beman/sndr_read_stream.hpp b/bench/beman/sndr_read_stream.hpp
new file mode 100644
index 000000000..5b626e010
--- /dev/null
+++ b/bench/beman/sndr_read_stream.hpp
@@ -0,0 +1,121 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// No-op sender stream for benchmarks.
+//
+// The stream holds a sender_thread_pool* (I/O context
+// handle), analogous to how a socket holds a reference
+// to its execution context. read_some() returns a sender
+// that captures this handle. The sender provides both
+// as_awaitable (for coroutine consumption) and connect
+// (for sender pipeline consumption).
+//
+
+#ifndef BOOST_CAPY_BENCH_SNDR_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_SNDR_READ_STREAM_HPP
+
+#include "sender_thread_pool.hpp"
+#include "thread_pool.hpp"
+
+#include <beman/execution/execution.hpp>
+
+#include <coroutine>
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+namespace ex = beman::execution;
+
+struct sndr_read_stream
+{
+    sender_thread_pool* pool_;
+
+    struct read_sender
+    {
+        using sender_concept = ex::sender_t;
+        using completion_signatures =
+            ex::completion_signatures<ex::set_value_t(std::size_t)>;
+
+        sender_thread_pool* pool_;
+
+        // awaitable path (co_awaited from io_task via as_awaitable)
+        template <typename Promise>
+        struct awaitable : work_item
+        {
+            sender_thread_pool* pool_;
+            std::coroutine_handle<> h_{};
+
+            explicit awaitable(sender_thread_pool* pool) noexcept
+                : pool_(pool) {}
+
+            bool await_ready() const noexcept { return false; }
+
+            void await_suspend(std::coroutine_handle<> h)
+            {
+                h_ = h;
+                pool_->enqueue(this);
+            }
+
+            std::size_t await_resume() noexcept { return 0; }
+
+            void execute() noexcept override { h_.resume(); }
+        };
+
+        template <typename Promise>
+        auto as_awaitable(Promise&) -> awaitable<Promise>
+        {
+            return awaitable<Promise>{pool_};
+        }
+
+        // sender path (consumed via ex::connect)
+        template <ex::receiver Receiver>
+        struct op_state : work_item
+        {
+            using operation_state_concept = ex::operation_state_t;
+
+            std::remove_cvref_t<Receiver> rcvr_;
+            sender_thread_pool* pool_;
+
+            op_state(Receiver rcvr, sender_thread_pool* pool)
+                : rcvr_(std::move(rcvr))
+                , pool_(pool)
+            {}
+
+            op_state(op_state const&) = delete;
+            op_state(op_state&&) = delete;
+            op_state& operator=(op_state const&) = delete;
+            op_state& operator=(op_state&&) = delete;
+
+            void execute() noexcept override
+            {
+                ex::set_value(std::move(rcvr_), std::size_t{0});
+            }
+
+            void start() & noexcept
+            {
+                pool_->enqueue(this);
+            }
+        };
+
+        template <ex::receiver Receiver>
+        auto connect(Receiver&& rcvr)
+            -> op_state<std::remove_cvref_t<Receiver>>
+        {
+            return {std::forward<Receiver>(rcvr), pool_};
+        }
+    };
+
+    read_sender read_some(auto)
+    {
+        return {pool_};
+    }
+};
+
+#endif
diff --git a/bench/beman/sndr_sync_read_stream.hpp b/bench/beman/sndr_sync_read_stream.hpp
new file mode 100644
index 000000000..1bebe63f9
--- /dev/null
+++ b/bench/beman/sndr_sync_read_stream.hpp
@@ -0,0 +1,113 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Synchronous-completion sender stream.
+//
+// Every read completes immediately. The sender's
+// start() calls set_value synchronously. The
+// as_awaitable path returns the coroutine handle
+// for symmetric transfer.
+//
+// repeat_until's trampoline bounds stack depth for
+// sender pipelines. Coroutines handle this via
+// symmetric transfer.
+//
+
+#ifndef BOOST_CAPY_BENCH_SNDR_SYNC_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_SNDR_SYNC_READ_STREAM_HPP
+
+#include <beman/execution/execution.hpp>
+
+#include <coroutine>
+#include <cstddef>
+#include <type_traits>
+#include <utility>
+
+namespace ex = beman::execution;
+
+struct sndr_sync_read_stream
+{
+    struct read_sender
+    {
+        using sender_concept = ex::sender_t;
+        using completion_signatures =
+            ex::completion_signatures<ex::set_value_t(std::size_t)>;
+
+        // awaitable path (co_awaited from bex::task via as_awaitable)
+        template <typename Promise>
+        struct awaitable
+        {
+            bool await_ready() const noexcept
+            {
+                return false;
+            }
+
+            std::coroutine_handle<>
+            await_suspend(std::coroutine_handle<> h)
+            {
+                // Data already buffered — resume inline
+                return h;
+            }
+
+            std::size_t await_resume() noexcept
+            {
+                return 0;
+            }
+        };
+
+        template <typename Promise>
+        auto as_awaitable(Promise&) -> awaitable<Promise>
+        {
+            return {};
+        }
+
+        // sender path (consumed via ex::connect)
+        template <ex::receiver Receiver>
+        struct op_state
+        {
+            using operation_state_concept =
+                ex::operation_state_t;
+
+            std::remove_cvref_t<Receiver> rcvr_;
+
+            op_state(Receiver rcvr)
+                : rcvr_(std::move(rcvr))
+            {}
+
+            op_state(op_state const&) = delete;
+            op_state(op_state&&) = delete;
+            op_state& operator=(op_state const&) = delete;
+            op_state& operator=(op_state&&) = delete;
+
+            void start() & noexcept
+            {
+                // Synchronous completion — causes
+                // stack overflow in loop algorithms
+                // without a trampoline
+                ex::set_value(
+                    std::move(rcvr_), std::size_t{0});
+            }
+        };
+
+        template <ex::receiver Receiver>
+        auto connect(Receiver&& rcvr)
+            -> op_state<std::remove_cvref_t<Receiver>>
+        {
+            return {std::forward<Receiver>(rcvr)};
+        }
+    };
+
+    read_sender read_some(auto)
+    {
+        return {};
+    }
+};
+
+#endif
diff --git a/bench/beman/thread_pool.hpp b/bench/beman/thread_pool.hpp
new file mode 100644
index 000000000..221ebaec8
--- /dev/null
+++ b/bench/beman/thread_pool.hpp
@@ -0,0 +1,69 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Intrusive queue and work_item base for benchmarks.
+//
+
+#ifndef BOOST_CAPY_BENCH_THREAD_POOL_HPP
+#define BOOST_CAPY_BENCH_THREAD_POOL_HPP
+
+#include <cstddef>
+
+template <typename T>
+class intrusive_queue
+{
+public:
+    class node
+    {
+        friend class intrusive_queue;
+        T* next_;
+    };
+
+private:
+    T* head_ = nullptr;
+    T* tail_ = nullptr;
+
+public:
+    intrusive_queue() = default;
+    intrusive_queue(intrusive_queue const&) = delete;
+    intrusive_queue& operator=(intrusive_queue const&) = delete;
+
+    bool empty() const noexcept { return head_ == nullptr; }
+
+    void push(T* w) noexcept
+    {
+        w->next_ = nullptr;
+        if (tail_)
+            tail_->next_ = w;
+        else
+            head_ = w;
+        tail_ = w;
+    }
+
+    T* pop() noexcept
+    {
+        if (!head_)
+            return nullptr;
+        T* w = head_;
+        head_ = head_->next_;
+        if (!head_)
+            tail_ = nullptr;
+        return w;
+    }
+};
+
+struct work_item : intrusive_queue<work_item>::node
+{
+    virtual void execute() noexcept = 0;
+protected:
+    ~work_item() = default;
+};
+
+#endif
diff --git a/bench/bench.cpp b/bench/bench.cpp
index 3c1720f15..b80624195 100644
--- a/bench/bench.cpp
+++ b/bench/bench.cpp
@@ -81,14 +81,14 @@ class bench_io_context::executor_type
     {
     }
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
-        return h;
+        return c.h;
     }
 
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
-        h.resume();
+        c.h.resume();
     }
 
     void defer(std::coroutine_handle<> h) const
diff --git a/bench/stdexec/allocation_tracker.hpp b/bench/stdexec/allocation_tracker.hpp
new file mode 100644
index 000000000..e8c43f5d0
--- /dev/null
+++ b/bench/stdexec/allocation_tracker.hpp
@@ -0,0 +1,71 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_ALLOCATION_TRACKER_HPP
+#define BOOST_CAPY_BENCH_ALLOCATION_TRACKER_HPP
+
+#include <atomic>
+#include <cstddef>
+#include <cstdlib>
+#include <memory_resource>
+#include <new>
+
+static std::atomic<int64_t> g_alloc_count{0};
+
+/// Counts every allocate call, then delegates to upstream.
+class counting_memory_resource
+    : public std::pmr::memory_resource
+{
+    std::pmr::memory_resource* upstream_;
+
+    void* do_allocate(
+        std::size_t n, std::size_t align) override
+    {
+        g_alloc_count.fetch_add(1, std::memory_order_relaxed);
+        return upstream_->allocate(n, align);
+    }
+
+    void do_deallocate(
+        void* p, std::size_t n, std::size_t align) override
+    {
+        upstream_->deallocate(p, n, align);
+    }
+
+    bool do_is_equal(
+        memory_resource const& other) const noexcept override
+    {
+        return this == &other;
+    }
+
+public:
+    explicit counting_memory_resource(
+        std::pmr::memory_resource* upstream) noexcept
+        : upstream_(upstream) {}
+};
+
+void* operator new(std::size_t n)
+{
+    g_alloc_count.fetch_add(1, std::memory_order_relaxed);
+    void* p = std::malloc(n);
+    if (!p)
+        throw std::bad_alloc();
+    return p;
+}
+
+void operator delete(void* p) noexcept
+{
+    std::free(p);
+}
+
+void operator delete(void* p, std::size_t) noexcept
+{
+    std::free(p);
+}
+
+#endif
diff --git a/bench/stdexec/awaitable_sender.hpp b/bench/stdexec/awaitable_sender.hpp
new file mode 100644
index 000000000..5f54601f4
--- /dev/null
+++ b/bench/stdexec/awaitable_sender.hpp
@@ -0,0 +1,567 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_AWAITABLE_SENDER_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_AWAITABLE_SENDER_HPP
+
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <stdexec/execution.hpp>
+
+#include <concepts>
+#include <coroutine>
+#include <exception>
+#include <stop_token>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace boost::capy {
+
+// Query CPO for obtaining a Capy-compatible executor
+// from a P2300 environment. The returned object must
+// satisfy Capy's Executor concept. Environments that
+// host IoAwaitables via the as_sender bridge must
+// answer this query.
+struct get_io_executor_t
+{
+    static consteval auto query(
+        stdexec::forwarding_query_t) noexcept -> bool
+    {
+        return true;
+    }
+
+    template<class Env>
+        requires requires(Env const& env) {
+            env.query(
+                std::declval<get_io_executor_t const&>());
+        }
+    auto operator()(Env const& env) const noexcept
+    {
+        return env.query(*this);
+    }
+};
+
+inline constexpr get_io_executor_t get_io_executor{};
+
+namespace detail {
+
+template<class T, class = void>
+struct has_tuple_protocol : std::false_type {};
+
+template<class T>
+struct has_tuple_protocol<T,
+    std::void_t<
+        typename std::tuple_size<T>::type,
+        typename std::tuple_element<0, T>::type>>
+    : std::true_type {};
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_ec_outcome : std::is_same<T, std::error_code> {};
+
+template<class T>
+struct is_ec_outcome<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> == 1 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_ec_outcome_v =
+    std::is_same_v<T, std::error_code> ||
+    is_ec_outcome<T>::value;
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_compound_ec_result : std::false_type {};
+
+template<class T>
+struct is_compound_ec_result<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> >= 2 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_compound_ec_result_v =
+    is_compound_ec_result<T>::value;
+
+struct frame_cb
+{
+    void (*resume)(frame_cb*);
+    void (*destroy)(frame_cb*);
+    void* data;
+};
+
+// Return the concrete executor by value, trying get_io_executor
+// on the env first, then falling back to the start scheduler.
+template<class Env>
+auto resolve_executor(Env const& env)
+{
+    if constexpr (requires { get_io_executor(env); })
+        return get_io_executor(env);
+    else
+        return stdexec::get_start_scheduler(env)
+            .query(get_io_executor_t{});
+}
+
+} // namespace detail
+
+/** Sender that wraps an IoAwaitable.
+
+    When connected or co_awaited, the bridge queries
+    the receiver's or promise's environment for a
+    Capy-compatible executor via get_io_executor.
+    The executor is stored by value in the operation
+    state and used to construct the io_env passed to
+    the IoAwaitable's await_suspend.
+
+    @tparam IoAw The IoAwaitable type.
+*/
+template<class IoAw>
+struct awaitable_sender
+{
+    using sender_concept = stdexec::sender_tag;
+
+    using result_type = decltype(
+        std::declval<std::decay_t<IoAw>&>().await_resume());
+
+    static auto make_sigs()
+    {
+        if constexpr (std::is_void_v<result_type>)
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+        else if constexpr (
+            detail::is_compound_ec_result_v<result_type>)
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(
+                    std::tuple_element_t<1, result_type>),
+                stdexec::set_error_t(std::error_code),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+        else if constexpr (
+            detail::is_ec_outcome_v<result_type>)
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(),
+                stdexec::set_error_t(std::error_code),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+        else
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(result_type),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+    }
+
+    using completion_signatures = decltype(make_sigs());
+
+    IoAw aw_;
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            stdexec::operation_state_tag;
+
+        // Concrete executor type deduced from the receiver's
+        // environment. Stored by value to avoid the dangling
+        // pointer that executor_ref would produce when the
+        // source is a temporary (scheduler query or prop).
+        using executor_type = decltype(
+            detail::resolve_executor(
+                stdexec::get_env(
+                    std::declval<Receiver const&>())));
+
+        IoAw aw_;
+        Receiver rcvr_;
+        executor_type ex_;
+        io_env env_;
+        detail::frame_cb cb_;
+
+        op_state(IoAw aw, Receiver rcvr)
+            : aw_(std::move(aw))
+            , rcvr_(std::move(rcvr))
+            , ex_{}
+            , cb_{}
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        static void
+        on_resume(detail::frame_cb* p) noexcept
+        {
+            auto* self = static_cast<op_state*>(p->data);
+            self->complete();
+        }
+
+        static void
+        on_destroy(detail::frame_cb*) noexcept
+        {
+        }
+
+        void complete() noexcept
+        {
+            try
+            {
+                if constexpr (std::is_void_v<result_type>)
+                {
+                    aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        stdexec::set_value(
+                            std::move(rcvr_));
+                }
+                else if constexpr (
+                    detail::is_compound_ec_result_v<result_type>)
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                    {
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    }
+                    else
+                    {
+                        auto ec = get<0>(result);
+                        if(!ec)
+                            stdexec::set_value(
+                                std::move(rcvr_),
+                                get<1>(std::move(result)));
+                        else
+                            stdexec::set_error(
+                                std::move(rcvr_), ec);
+                    }
+                }
+                else if constexpr (
+                    detail::is_ec_outcome_v<result_type>)
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                    {
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    }
+                    else
+                    {
+                        std::error_code ec;
+                        if constexpr (std::is_same_v<
+                            result_type, std::error_code>)
+                            ec = result;
+                        else
+                            ec = get<0>(result);
+                        if(!ec)
+                            stdexec::set_value(
+                                std::move(rcvr_));
+                        else
+                            stdexec::set_error(
+                                std::move(rcvr_), ec);
+                    }
+                }
+                else
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        stdexec::set_value(
+                            std::move(rcvr_),
+                            std::move(result));
+                }
+            }
+            catch(...)
+            {
+                stdexec::set_error(
+                    std::move(rcvr_),
+                    std::current_exception());
+            }
+        }
+
+        void start() noexcept
+        {
+            auto renv = stdexec::get_env(rcvr_);
+            ex_ = detail::resolve_executor(renv);
+
+            std::stop_token st;
+            if constexpr (requires {
+                { renv.query(stdexec::get_stop_token_t{}) }
+                    -> std::convertible_to<std::stop_token>; })
+            {
+                st = renv.query(
+                    stdexec::get_stop_token_t{});
+            }
+
+            env_ = io_env{ex_, st, nullptr};
+
+            if(aw_.await_ready())
+            {
+                complete();
+                return;
+            }
+
+            cb_.resume = &on_resume;
+            cb_.destroy = &on_destroy;
+            cb_.data = this;
+
+            auto h = std::coroutine_handle<>::from_address(
+                static_cast<void*>(&cb_));
+
+            auto resumed = detail::call_await_suspend(
+                &aw_, h, &env_);
+            if(resumed == h)
+                complete();
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(aw_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(aw_, std::move(rcvr));
+    }
+
+    // Bypass stdexec's sender_awaitable when co_awaited
+    // from a coroutine that provides get_io_executor or
+    // a start scheduler with get_io_executor. Adapts the
+    // IoAwaitable's 2-arg await_suspend to the standard
+    // 1-arg protocol.
+    template<class Promise>
+    auto as_awaitable(Promise& promise) &&
+    {
+        auto penv = promise.get_env();
+        auto ex = detail::resolve_executor(penv);
+
+        std::stop_token st;
+        if constexpr (requires {
+            { penv.query(stdexec::get_stop_token_t{}) }
+                -> std::convertible_to<std::stop_token>; })
+        {
+            st = penv.query(
+                stdexec::get_stop_token_t{});
+        }
+
+        using executor_type = decltype(ex);
+
+        struct aw
+        {
+            IoAw aw_;
+            executor_type ex_;
+            std::stop_token st_;
+            io_env env_;
+
+            bool await_ready() noexcept
+            {
+                return aw_.await_ready();
+            }
+
+            std::coroutine_handle<>
+            await_suspend(std::coroutine_handle<> h)
+            {
+                env_ = io_env{ex_, st_, nullptr};
+                return aw_.await_suspend(h, &env_);
+            }
+
+            auto await_resume()
+            {
+                return aw_.await_resume();
+            }
+        };
+
+        return aw{std::move(aw_), std::move(ex), st, {}};
+    }
+};
+
+/** Create a stdexec sender from an IoAwaitable.
+
+    The bridge routes the awaitable's result through sender
+    channels based on its type:
+
+    - `void` - calls `set_value()`.
+    - `error_code` (or a single-element tuple-like whose
+      element 0 is `error_code`) - calls `set_value()`
+      when the code is zero, `set_error(ec)` otherwise.
+    - Any other single value `T` - calls `set_value(T)`.
+    - Compound results whose element 0 is `error_code`
+      with additional elements are rejected at compile
+      time. Wrap the operation in a `task<error_code>`
+      that inspects the compound result and returns the
+      error code.
+
+    When connected or co_awaited, the bridge queries the
+    receiver's or promise's environment for a Capy executor
+    via get_io_executor. The environment must answer this
+    query with an object satisfying Capy's Executor concept.
+
+    @param aw The IoAwaitable to wrap.
+    @return A sender whose completion channels reflect
+        the awaitable's result type.
+*/
+template<class IoAw>
+auto as_sender(IoAw&& aw)
+{
+    return awaitable_sender<std::decay_t<IoAw>>{
+        std::forward<IoAw>(aw)};
+}
+
+// split_ec: sender adapter that routes error_code to
+// set_value() or set_error(ec) at runtime.
+
+namespace detail {
+
+template<class Sender>
+struct split_ec_sender
+{
+    using sender_concept = stdexec::sender_tag;
+
+    using completion_signatures =
+        stdexec::completion_signatures<
+            stdexec::set_value_t(),
+            stdexec::set_error_t(std::error_code),
+            stdexec::set_error_t(std::exception_ptr),
+            stdexec::set_stopped_t()>;
+
+    Sender sndr_;
+
+    template<class Receiver>
+    struct ec_receiver
+    {
+        using receiver_concept = stdexec::receiver_tag;
+
+        Receiver rcvr_;
+
+        auto get_env() const noexcept
+        {
+            return stdexec::get_env(rcvr_);
+        }
+
+        void set_value(std::error_code ec) && noexcept
+        {
+            if (!ec)
+                stdexec::set_value(
+                    std::move(rcvr_));
+            else
+                stdexec::set_error(
+                    std::move(rcvr_), ec);
+        }
+
+        void set_value() && noexcept
+        {
+            stdexec::set_value(
+                std::move(rcvr_));
+        }
+
+        template<class E>
+        void set_error(E&& e) && noexcept
+        {
+            stdexec::set_error(
+                std::move(rcvr_),
+                std::forward<E>(e));
+        }
+
+        void set_stopped() && noexcept
+        {
+            stdexec::set_stopped(
+                std::move(rcvr_));
+        }
+    };
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            stdexec::operation_state_tag;
+
+        using inner_op_t = decltype(
+            stdexec::connect(
+                std::declval<Sender>(),
+                std::declval<ec_receiver<Receiver>>()));
+
+        inner_op_t op_;
+
+        op_state(Sender sndr, Receiver rcvr)
+            : op_(stdexec::connect(
+                std::move(sndr),
+                ec_receiver<Receiver>{std::move(rcvr)}))
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        void start() noexcept
+        {
+            stdexec::start(op_);
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(sndr_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            sndr_, std::move(rcvr));
+    }
+};
+
+} // namespace detail
+
+/** Split an `error_code` value channel into success and error channels.
+
+    Takes a sender that completes with `set_value(error_code)` and
+    routes it at runtime: `set_value()` when the code is zero,
+    `set_error(ec)` otherwise. No exceptions.
+
+    @param sndr The predecessor sender.
+    @return A sender completing with `set_value()`,
+        `set_error(error_code)`, or `set_stopped()`.
+*/
+template<class Sender>
+auto split_ec(Sender&& sndr)
+{
+    return detail::split_ec_sender<
+        std::decay_t<Sender>>{
+            std::forward<Sender>(sndr)};
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/bench/stdexec/ioaw_io_read_stream.hpp b/bench/stdexec/ioaw_io_read_stream.hpp
new file mode 100644
index 000000000..eb18dc9bf
--- /dev/null
+++ b/bench/stdexec/ioaw_io_read_stream.hpp
@@ -0,0 +1,36 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_IOAW_IO_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_IOAW_IO_READ_STREAM_HPP
+
+#include "ioaw_read_stream.hpp"
+
+/// Abstract interface for IoAwaitable read streams.
+struct ioaw_io_read_stream
+{
+    virtual ioaw_read_stream::read_awaitable
+        read_some(boost::capy::mutable_buffer) = 0;
+    virtual ~ioaw_io_read_stream() = default;
+};
+
+/// Concrete implementation of ioaw_io_read_stream wrapping
+/// an ioaw_read_stream.
+struct ioaw_io_read_stream_impl : ioaw_io_read_stream
+{
+    ioaw_read_stream stream_;
+
+    ioaw_read_stream::read_awaitable
+        read_some(boost::capy::mutable_buffer buf) override
+    {
+        return stream_.read_some(buf);
+    }
+};
+
+#endif
diff --git a/bench/stdexec/ioaw_read_stream.hpp b/bench/stdexec/ioaw_read_stream.hpp
new file mode 100644
index 000000000..71c67f4b1
--- /dev/null
+++ b/bench/stdexec/ioaw_read_stream.hpp
@@ -0,0 +1,57 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_IOAW_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_IOAW_READ_STREAM_HPP
+
+#include <boost/capy/buffers.hpp>
+#include <boost/capy/concept/read_stream.hpp>
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+#include <coroutine>
+#include <cstddef>
+
+/// No-op ReadStream for benchmarking.
+///
+/// Uses the executor from io_env (passed by capy::task's
+/// transform_awaiter) to post the coroutine back. Satisfies
+/// ReadStream so it can be wrapped by capy::any_read_stream.
+struct ioaw_read_stream
+{
+    struct read_awaitable
+    {
+        boost::capy::continuation cont_{};
+
+        bool await_ready() const noexcept { return false; }
+
+        std::coroutine_handle<>
+        await_suspend(
+            std::coroutine_handle<> h,
+            boost::capy::io_env const* env)
+        {
+            cont_.h = h;
+            env->executor.post(cont_);
+            return std::noop_coroutine();
+        }
+
+        boost::capy::io_result<std::size_t>
+        await_resume() noexcept { return {{}, 0}; }
+    };
+
+    template <boost::capy::MutableBufferSequence MB>
+    read_awaitable read_some(MB)
+    {
+        return {};
+    }
+};
+
+static_assert(boost::capy::ReadStream<ioaw_read_stream>);
+
+#endif
diff --git a/bench/stdexec/ioaw_sync_read_stream.hpp b/bench/stdexec/ioaw_sync_read_stream.hpp
new file mode 100644
index 000000000..5daaac128
--- /dev/null
+++ b/bench/stdexec/ioaw_sync_read_stream.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Synchronous-completion IoAwaitable stream.
+//
+// Every read completes immediately via symmetric
+// transfer — await_suspend returns the coroutine
+// handle, causing an inline resume with no scheduler
+// round-trip.
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_IOAW_SYNC_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_IOAW_SYNC_READ_STREAM_HPP
+
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <coroutine>
+#include <cstddef>
+
+struct ioaw_sync_read_stream
+{
+    struct read_awaitable
+    {
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(
+            std::coroutine_handle<> h,
+            boost::capy::io_env const*)
+        {
+            // Data already buffered — resume inline
+            return h;
+        }
+
+        boost::capy::io_result<std::size_t>
+        await_resume() noexcept
+        {
+            return {{}, 0};
+        }
+    };
+
+    read_awaitable read_some(auto)
+    {
+        return {};
+    }
+};
+
+#endif
diff --git a/bench/stdexec/main.cpp b/bench/stdexec/main.cpp
new file mode 100644
index 000000000..c1edc93ad
--- /dev/null
+++ b/bench/stdexec/main.cpp
@@ -0,0 +1,815 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// I/O Read Stream Benchmark (stdexec edition)
+//
+// Compares three execution models across four stream
+// abstraction levels. 20M read_some calls per cell,
+// single thread.
+//
+// Table 1: sender pipeline   (connect/start)
+// Table 2: capy::task        (capy::thread_pool)
+// Table 3: exec::task        (exec::static_thread_pool)
+//
+// Each table has four rows:
+//   Native      - concrete stream, full visibility
+//   Abstract    - virtual dispatch, implementation hidden
+//   Type erased - value-type erasure (exec::any_sender)
+//   Synchronous - no scheduler trip
+//
+
+#include "allocation_tracker.hpp"
+#include "awaitable_sender.hpp"
+#include "ioaw_io_read_stream.hpp"
+#include "ioaw_read_stream.hpp"
+#include "ioaw_sync_read_stream.hpp"
+#include <exec/repeat_until.hpp>
+#include "sender_awaitable.hpp"
+#include "sender_io_env.hpp"
+#include "sndr_any_read_stream.hpp"
+#include "sndr_io_read_stream.hpp"
+#include "sndr_read_stream.hpp"
+#include "sndr_sync_read_stream.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/io/any_read_stream.hpp>
+
+#include <exec/function.hpp>
+#include <exec/static_thread_pool.hpp>
+#include <exec/task.hpp>
+#include <stdexec/execution.hpp>
+
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <latch>
+#include <memory>
+
+namespace capy = boost::capy;
+
+static counting_memory_resource g_counting_resource{
+    capy::get_recycling_memory_resource()};
+
+auto get_counting_resource() -> std::pmr::memory_resource*
+{
+    return &g_counting_resource;
+}
+
+struct cell_result
+{
+    long long ns = 0;
+    int64_t allocs = 0;
+};
+
+static constexpr int OPS_PER_CELL = 20'000'000;
+static constexpr int OUTER_LOOPS  = 2'000;
+static constexpr int INNER_LOOPS  = 10'000;
+
+static constexpr int NUM_RUNS    = 5;
+static constexpr int NUM_TABLES  = 3;
+static constexpr int NUM_STREAMS = 4;
+static constexpr int NUM_COLUMNS = 2;
+
+static constexpr int SENDER_RECEIVER = 0;
+static constexpr int CAPY_TASK       = 1;
+static constexpr int EXEC_TASK       = 2;
+
+static constexpr int NATIVE_STREAM      = 0;
+static constexpr int ABSTRACT_STREAM    = 1;
+static constexpr int TYPE_ERASED_STREAM = 2;
+static constexpr int SYNC_STREAM        = 3;
+
+static constexpr int NATIVE_EXEC_MODEL  = 0;
+static constexpr int BRIDGED_EXEC_MODEL = 1;
+
+// -----------------------------------------------------------
+// Table 2: capy::task - Column A (awaitable, native)
+// -----------------------------------------------------------
+
+template <class Stream>
+capy::task<> capy_session(Stream& stream)
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await stream.read_some(
+            capy::mutable_buffer(buf, sizeof(buf)));
+}
+
+template <class Stream>
+capy::task<> capy_accept(Stream& stream, cell_result& out)
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await capy_session(stream);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+// -----------------------------------------------------------
+// Table 2: capy::task - Column B (sender via await_sender)
+// -----------------------------------------------------------
+
+template <class Stream>
+capy::task<> capy_session_sndr(Stream& stream)
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await capy::await_sender(
+            stream.read_some(
+                capy::mutable_buffer(buf, sizeof(buf))));
+}
+
+template <class Stream>
+capy::task<> capy_accept_sndr(Stream& stream, cell_result& out)
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await capy_session_sndr(stream);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+// -----------------------------------------------------------
+// Table 3: exec::task - Column A (sender, native)
+// -----------------------------------------------------------
+
+template <class Stream>
+auto exec_session(Stream& stream) -> exec::task<void>
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await stream.read_some(
+            capy::mutable_buffer(buf, sizeof(buf)));
+}
+
+template <class Stream>
+auto exec_accept(Stream& stream, cell_result& out)
+    -> exec::task<void>
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await exec_session(stream);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+// -----------------------------------------------------------
+// Table 3: exec::task - Column B (awaitable via as_sender)
+//
+// exec::task's promise env carries only get_start_scheduler
+// (type-erased __any_scheduler), which does not propagate
+// custom queries like get_io_executor. We supply the
+// executor explicitly and inject it into each per-call env
+// via write_env so the awaitable_sender bridge can find it.
+// -----------------------------------------------------------
+
+template <class Stream>
+auto exec_session_ioaw(
+    Stream& stream,
+    sender_as_capy_executor ex) -> exec::task<void>
+{
+    char buf[64];
+    for (int i = 0; i < INNER_LOOPS; ++i)
+        (void)co_await stdexec::write_env(
+            capy::as_sender(
+                stream.read_some(
+                    capy::mutable_buffer(buf, sizeof(buf)))),
+            stdexec::prop{capy::get_io_executor, ex});
+}
+
+template <class Stream>
+auto exec_accept_ioaw(
+    Stream& stream,
+    sender_as_capy_executor ex,
+    cell_result& out) -> exec::task<void>
+{
+    auto before = g_alloc_count.load(std::memory_order_relaxed);
+    auto start = std::chrono::steady_clock::now();
+
+    for (int i = 0; i < OUTER_LOOPS; ++i)
+        co_await exec_session_ioaw(stream, ex);
+
+    auto elapsed = std::chrono::steady_clock::now() - start;
+    auto after = g_alloc_count.load(std::memory_order_relaxed);
+    out = {std::chrono::duration_cast<
+        std::chrono::nanoseconds>(elapsed).count(),
+        after - before};
+}
+
+int main()
+{
+    cell_result grid[NUM_RUNS + 1][NUM_TABLES][NUM_STREAMS][NUM_COLUMNS]{};
+
+    // run 0 is a warmup pass (results discarded);
+    // measured runs are 1..NUM_RUNS
+    for (int run = 0; run <= NUM_RUNS; ++run)
+    {
+
+    // -----------------------------------------------------------
+    // Table 1: sender/receiver pipeline (repeat_until)
+    //
+    // All Table 1 cells use exec::static_thread_pool(2) instead of (1).
+    // exec::repeat_until synchronously emplaces iteration N+1 inside
+    // iteration N's set_value cascade. With a single-worker pool the
+    // worker is stuck in that cascade and can't dispatch the post the
+    // cascade just queued, deadlocking. A second worker drains the
+    // queue while the first is in the cascade. Tables 2 and 3 stay at
+    // pool(1) because co_await suspension releases the worker between
+    // iterations and avoids the issue.
+    // -----------------------------------------------------------
+
+    // Col A: Sender (native)
+
+    // Native - sndr_read_stream
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        sndr_read_stream stream{&pool};
+        pool_scheduler sched{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec::repeat_until(
+                stdexec::let_value(stdexec::just(), [&]() {
+                    return stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf)));
+                })
+                | stdexec::then([&count](std::size_t) { return --count == 0; }))));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][NATIVE_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Abstract - sndr_io_read_stream
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        sndr_io_read_stream_impl stream{&pool};
+        pool_scheduler sched{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto* mr = get_counting_resource();
+        std::pmr::polymorphic_allocator<std::byte> alloc(mr);
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(
+            stdexec::write_env(
+                stdexec::starts_on(sched,
+                    exec::repeat_until(
+                        stdexec::let_value(stdexec::just(), [&]() {
+                            return static_cast<sndr_io_read_stream&>(
+                                stream).read_some(
+                                    capy::mutable_buffer(buf, sizeof(buf)));
+                        })
+                        | stdexec::then([&count](std::size_t) { return --count == 0; }))),
+                stdexec::prop{exec::get_frame_allocator, alloc}));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][ABSTRACT_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Type erased - sndr_any_read_stream
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        sndr_any_read_stream stream(sndr_read_stream{&pool});
+        pool_scheduler sched{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto* mr = get_counting_resource();
+        std::pmr::polymorphic_allocator<std::byte> alloc(mr);
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(
+            stdexec::write_env(
+                stdexec::starts_on(sched,
+                    exec::repeat_until(
+                        stdexec::let_value(stdexec::just(), [&]() {
+                            return stream.read_some(
+                                capy::mutable_buffer(buf, sizeof(buf)));
+                        })
+                        | stdexec::then([&count](std::size_t) { return --count == 0; }))),
+                stdexec::prop{exec::get_frame_allocator, alloc}));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][TYPE_ERASED_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Synchronous - sndr_sync_read_stream (Col A)
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        sndr_sync_read_stream stream;
+        pool_scheduler sched{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec::repeat_until(
+                stdexec::let_value(stdexec::just(), [&]() {
+                    return stream.read_some(
+                        capy::mutable_buffer(buf, sizeof(buf)));
+                })
+                | stdexec::then([&count](std::size_t) { return --count == 0; }))));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][SYNC_STREAM][NATIVE_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Col B: Awaitable (via as_sender bridge)
+
+    // Native - ioaw_read_stream
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        ioaw_read_stream stream;
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec::repeat_until(
+                stdexec::let_value(stdexec::just(), [&]() {
+                    return stdexec::write_env(
+                        capy::as_sender(stream.read_some(
+                            capy::mutable_buffer(buf, sizeof(buf)))),
+                        stdexec::prop{capy::get_io_executor, adapter});
+                })
+                | stdexec::then([&count](std::size_t) { return --count == 0; }))));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][NATIVE_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Abstract - ioaw_io_read_stream
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        ioaw_io_read_stream_impl stream;
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec::repeat_until(
+                stdexec::let_value(stdexec::just(), [&]() {
+                    return stdexec::write_env(
+                        capy::as_sender(
+                            static_cast<ioaw_io_read_stream&>(
+                                stream).read_some(
+                                    capy::mutable_buffer(
+                                        buf, sizeof(buf)))),
+                        stdexec::prop{capy::get_io_executor, adapter});
+                })
+                | stdexec::then([&count](std::size_t) { return --count == 0; }))));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][ABSTRACT_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Type erased - capy::any_read_stream
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        ioaw_read_stream concrete;
+        capy::any_read_stream stream(&concrete);
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec::repeat_until(
+                stdexec::let_value(stdexec::just(), [&]() {
+                    return stdexec::write_env(
+                        capy::as_sender(stream.read_some(
+                            capy::mutable_buffer(buf, sizeof(buf)))),
+                        stdexec::prop{capy::get_io_executor, adapter});
+                })
+                | stdexec::then([&count](std::size_t) { return --count == 0; }))));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][TYPE_ERASED_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // Synchronous - ioaw_sync_read_stream (Col B)
+    {
+        exec::static_thread_pool pool(2);
+        static_pool_context ctx;
+        ioaw_sync_read_stream stream;
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        int count = OPS_PER_CELL;
+        char buf[64];
+        auto before = g_alloc_count.load(std::memory_order_relaxed);
+        auto start = std::chrono::steady_clock::now();
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec::repeat_until(
+                stdexec::let_value(stdexec::just(), [&]() {
+                    return stdexec::write_env(
+                        capy::as_sender(stream.read_some(
+                            capy::mutable_buffer(buf, sizeof(buf)))),
+                        stdexec::prop{capy::get_io_executor, adapter});
+                })
+                | stdexec::then([&count](std::size_t) { return --count == 0; }))));
+        pool.request_stop();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        auto after = g_alloc_count.load(std::memory_order_relaxed);
+        grid[run][SENDER_RECEIVER][SYNC_STREAM][BRIDGED_EXEC_MODEL] = {
+            std::chrono::duration_cast<
+                std::chrono::nanoseconds>(elapsed).count(),
+            after - before};
+    }
+
+    // -----------------------------------------------------------
+    // Table 2: capy::task (capy::thread_pool)
+    // -----------------------------------------------------------
+
+    // Col A: Awaitable (native)
+
+    // Native - ioaw_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_read_stream stream;
+        capy::run_async(pool.get_executor())(
+            capy_accept(stream,
+                grid[run][CAPY_TASK][NATIVE_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Abstract - ioaw_io_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_io_read_stream_impl stream;
+        capy::run_async(pool.get_executor())(
+            capy_accept(static_cast<ioaw_io_read_stream&>(stream),
+                grid[run][CAPY_TASK][ABSTRACT_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Type erased - capy::any_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_read_stream concrete;
+        capy::any_read_stream stream(&concrete);
+        capy::run_async(pool.get_executor())(
+            capy_accept(stream,
+                grid[run][CAPY_TASK][TYPE_ERASED_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Synchronous - ioaw_sync_read_stream
+    {
+        capy::thread_pool pool(1);
+        ioaw_sync_read_stream stream;
+        capy::run_async(pool.get_executor())(
+            capy_accept(stream,
+                grid[run][CAPY_TASK][SYNC_STREAM][NATIVE_EXEC_MODEL]));
+        pool.join();
+    }
+
+    // Col B: Sender (via await_sender bridge)
+
+    // Native - sndr_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        sender_as_capy_executor adapter{&pool, &ctx};
+        sndr_read_stream stream{&pool};
+        std::latch done(1);
+        capy::run_async(adapter,
+            [&done](auto&&...) noexcept { done.count_down(); })(
+            capy_accept_sndr(stream,
+                grid[run][CAPY_TASK][NATIVE_STREAM][BRIDGED_EXEC_MODEL]));
+        done.wait();
+        pool.request_stop();
+    }
+
+    // Abstract - sndr_io_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        sender_as_capy_executor adapter{&pool, &ctx};
+        sndr_io_read_stream_impl stream{&pool};
+        std::latch done(1);
+        capy::run_async(adapter,
+            [&done](auto&&...) noexcept { done.count_down(); })(
+            capy_accept_sndr(
+                static_cast<sndr_io_read_stream&>(stream),
+                grid[run][CAPY_TASK][ABSTRACT_STREAM][BRIDGED_EXEC_MODEL]));
+        done.wait();
+        pool.request_stop();
+    }
+
+    // Type erased - sndr_any_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        sender_as_capy_executor adapter{&pool, &ctx};
+        sndr_any_read_stream stream(sndr_read_stream{&pool});
+        std::latch done(1);
+        capy::run_async(adapter,
+            [&done](auto&&...) noexcept { done.count_down(); })(
+            capy_accept_sndr(stream,
+                grid[run][CAPY_TASK][TYPE_ERASED_STREAM][BRIDGED_EXEC_MODEL]));
+        done.wait();
+        pool.request_stop();
+    }
+
+    // Synchronous - sndr_sync_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        sender_as_capy_executor adapter{&pool, &ctx};
+        sndr_sync_read_stream stream;
+        std::latch done(1);
+        capy::run_async(adapter,
+            [&done](auto&&...) noexcept { done.count_down(); })(
+            capy_accept_sndr(stream,
+                grid[run][CAPY_TASK][SYNC_STREAM][BRIDGED_EXEC_MODEL]));
+        done.wait();
+        pool.request_stop();
+    }
+
+    // -----------------------------------------------------------
+    // Table 3: exec::task (exec::static_thread_pool)
+    // -----------------------------------------------------------
+
+    // Col A: Sender (native)
+
+    // Native - sndr_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sndr_read_stream stream{&pool};
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec_accept(stream,
+                grid[run][EXEC_TASK][NATIVE_STREAM][NATIVE_EXEC_MODEL])));
+        pool.request_stop();
+    }
+
+    // Abstract - sndr_io_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sndr_io_read_stream_impl stream{&pool};
+        auto* mr = get_counting_resource();
+        std::pmr::polymorphic_allocator<std::byte> alloc(mr);
+        stdexec::sync_wait(
+            stdexec::write_env(
+                stdexec::starts_on(sched,
+                    exec_accept(
+                        static_cast<sndr_io_read_stream&>(stream),
+                        grid[run][EXEC_TASK][ABSTRACT_STREAM][NATIVE_EXEC_MODEL])),
+                stdexec::prop{exec::get_frame_allocator, alloc}));
+        pool.request_stop();
+    }
+
+    // Type erased - sndr_any_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sndr_any_read_stream stream(sndr_read_stream{&pool});
+        auto* mr = get_counting_resource();
+        std::pmr::polymorphic_allocator<std::byte> alloc(mr);
+        stdexec::sync_wait(
+            stdexec::write_env(
+                stdexec::starts_on(sched,
+                    exec_accept(stream,
+                        grid[run][EXEC_TASK][TYPE_ERASED_STREAM][NATIVE_EXEC_MODEL])),
+                stdexec::prop{exec::get_frame_allocator, alloc}));
+        pool.request_stop();
+    }
+
+    // Synchronous - sndr_sync_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sndr_sync_read_stream stream;
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec_accept(stream,
+                grid[run][EXEC_TASK][SYNC_STREAM][NATIVE_EXEC_MODEL])));
+        pool.request_stop();
+    }
+
+    // Col B: Awaitable (via as_sender bridge)
+
+    // Native - ioaw_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        ioaw_read_stream stream;
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec_accept_ioaw(stream, adapter,
+                grid[run][EXEC_TASK][NATIVE_STREAM][BRIDGED_EXEC_MODEL])));
+        pool.request_stop();
+    }
+
+    // Abstract - ioaw_io_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        ioaw_io_read_stream_impl stream;
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec_accept_ioaw(
+                static_cast<ioaw_io_read_stream&>(stream),
+                adapter,
+                grid[run][EXEC_TASK][ABSTRACT_STREAM][BRIDGED_EXEC_MODEL])));
+        pool.request_stop();
+    }
+
+    // Type erased - capy::any_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        ioaw_read_stream concrete;
+        capy::any_read_stream stream(&concrete);
+        auto* mr = get_counting_resource();
+        std::pmr::polymorphic_allocator<std::byte> alloc(mr);
+        stdexec::sync_wait(
+            stdexec::write_env(
+                stdexec::starts_on(sched,
+                    exec_accept_ioaw(stream, adapter,
+                        grid[run][EXEC_TASK][TYPE_ERASED_STREAM][BRIDGED_EXEC_MODEL])),
+                stdexec::prop{exec::get_frame_allocator, alloc}));
+        pool.request_stop();
+    }
+
+    // Synchronous - ioaw_sync_read_stream
+    {
+        exec::static_thread_pool pool(1);
+        static_pool_context ctx;
+        pool_scheduler sched{&pool, &ctx};
+        sender_as_capy_executor adapter{&pool, &ctx};
+        ioaw_sync_read_stream stream;
+        stdexec::sync_wait(stdexec::starts_on(sched,
+            exec_accept_ioaw(stream, adapter,
+                grid[run][EXEC_TASK][SYNC_STREAM][BRIDGED_EXEC_MODEL])));
+        pool.request_stop();
+    }
+
+    } // for (run)
+
+    // -----------------------------------------------------------
+    // Print results
+    // -----------------------------------------------------------
+
+    constexpr double ops = static_cast<double>(OPS_PER_CELL);
+
+    std::printf(
+        "I/O read stream benchmark (stdexec): "
+        "%d read_some calls per cell, %d runs\n",
+        OPS_PER_CELL, NUM_RUNS);
+
+    char const* row_labels[] = {
+        "Native", "Abstract", "Type-erased", "Synchronous"};
+
+    auto print_table = [&](
+        char const* title,
+        int table,
+        char const* col_a_label,
+        char const* col_b_label)
+    {
+        std::printf("\n  %s\n", title);
+        std::printf(
+            "  %-18s  %-30s  %-30s\n",
+            "", col_a_label, col_b_label);
+        std::printf(
+            "  %-18s  %-30s  %-30s\n",
+            "------------------",
+            "------------------------------",
+            "------------------------------");
+
+        for (int s = 0; s < NUM_STREAMS; ++s)
+        {
+            double sum[NUM_COLUMNS]{};
+            double sum2[NUM_COLUMNS]{};
+            double al[NUM_COLUMNS]{};
+            for (int c = 0; c < NUM_COLUMNS; ++c)
+            {
+                for (int r = 1; r <= NUM_RUNS; ++r)
+                {
+                    double v = static_cast<double>(
+                        grid[r][table][s][c].ns) / ops;
+                    sum[c] += v;
+                    sum2[c] += v * v;
+                    al[c] += static_cast<double>(
+                        grid[r][table][s][c].allocs);
+                }
+            }
+
+            double mean[NUM_COLUMNS];
+            double sd[NUM_COLUMNS];
+            double mean_al[NUM_COLUMNS];
+            for (int c = 0; c < NUM_COLUMNS; ++c)
+            {
+                mean[c] = sum[c] / NUM_RUNS;
+                double var = sum2[c] / NUM_RUNS -
+                    mean[c] * mean[c];
+                sd[c] = std::sqrt(var > 0 ? var : 0);
+                mean_al[c] = al[c] / (NUM_RUNS * ops);
+            }
+
+            std::printf(
+                "  %-18s"
+                "  %5.1f +/- %3.1f ns/op  %1.0f al/op"
+                "    %5.1f +/- %3.1f ns/op  %1.0f al/op"
+                "\n",
+                row_labels[s],
+                mean[0], sd[0], mean_al[0],
+                mean[1], sd[1], mean_al[1]);
+        }
+    };
+
+    print_table(
+        "sender/receiver pipeline",
+        SENDER_RECEIVER,
+        "A: sender (native)",
+        "B: awaitable (bridge)");
+
+    print_table(
+        "capy::task",
+        CAPY_TASK,
+        "A: awaitable (native)",
+        "B: sender (bridge)");
+
+    print_table(
+        "exec::task",
+        EXEC_TASK,
+        "A: sender (native)",
+        "B: awaitable (bridge)");
+
+    return 0;
+}
diff --git a/bench/stdexec/sender_awaitable.hpp b/bench/stdexec/sender_awaitable.hpp
new file mode 100644
index 000000000..045d2b893
--- /dev/null
+++ b/bench/stdexec/sender_awaitable.hpp
@@ -0,0 +1,428 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_SENDER_AWAITABLE_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_SENDER_AWAITABLE_HPP
+
+#include <boost/capy/error.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <stdexec/execution.hpp>
+
+#include <atomic>
+#include <coroutine>
+#include <exception>
+#include <new>
+#include <stop_token>
+#include <system_error>
+#include <tuple>
+#include <type_traits>
+#include <variant>
+
+namespace boost::capy {
+
+namespace detail {
+
+struct stopped_t {};
+
+struct operation_cancelled {};
+
+struct bridge_env
+{
+    std::stop_token st_;
+
+    auto query(
+        stdexec::get_stop_token_t const&)
+            const noexcept
+    {
+        return st_;
+    }
+};
+
+template<class Sender>
+using sender_single_value_t =
+    stdexec::value_types_of_t<
+        Sender,
+        bridge_env,
+        std::tuple,
+        std::type_identity_t>;
+
+// Detect whether a sender can complete with
+// set_error(std::error_code).
+template<class Sender>
+struct has_error_code_completion
+{
+    template<class... Es>
+    struct checker
+    {
+        static constexpr bool value =
+            (std::is_same_v<
+                Es, std::error_code> || ...);
+    };
+
+    static constexpr bool value =
+        stdexec::error_types_of_t<
+            Sender,
+            bridge_env,
+            checker>::value;
+};
+
+template<class Sender>
+constexpr bool has_error_code_v =
+    has_error_code_completion<Sender>::value;
+
+// Variant when sender can complete with
+// set_error(error_code): separate slot so
+// error_code is not wrapped in exception_ptr.
+template<class ValueTuple>
+using ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::error_code,
+    std::exception_ptr,
+    stopped_t>;
+
+// Variant when sender does not complete with
+// set_error(error_code).
+template<class ValueTuple>
+using no_ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::exception_ptr,
+    stopped_t>;
+
+template<class ValueTuple, bool HasEc>
+using result_variant = std::conditional_t<
+    HasEc,
+    ec_result_variant<ValueTuple>,
+    no_ec_result_variant<ValueTuple>>;
+
+// Bridge receiver that stores the sender's
+// completion result and resumes the coroutine.
+// Uses an atomic flag shared with await_suspend
+// to handle synchronous completion safely:
+// whichever side (set_value or await_suspend)
+// arrives second is responsible for resumption.
+template<class ValueTuple, bool HasEc>
+struct bridge_receiver
+{
+    using receiver_concept =
+        stdexec::receiver_t;
+
+    result_variant<ValueTuple, HasEc>* result_;
+    std::coroutine_handle<>            cont_;
+    std::stop_token                    st_;
+    std::atomic<bool>*                 done_;
+
+    auto get_env() const noexcept -> bridge_env
+    {
+        return {st_};
+    }
+
+    void resume_if_ready() noexcept
+    {
+        if(done_->exchange(
+            true, std::memory_order_acq_rel))
+            cont_.resume();
+    }
+
+    template<class... Args>
+    void set_value(Args&&... args) && noexcept
+    {
+        result_->template emplace<1>(
+            std::forward<Args>(args)...);
+        resume_if_ready();
+    }
+
+    template<class E>
+    void set_error(E&& e) && noexcept
+    {
+        if constexpr (
+            HasEc &&
+            std::is_same_v<
+                std::decay_t<E>,
+                std::error_code>)
+            result_->template emplace<2>(
+                std::forward<E>(e));
+        else if constexpr (
+            std::is_same_v<
+                std::decay_t<E>,
+                std::exception_ptr>)
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::forward<E>(e));
+        }
+        else
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::make_exception_ptr(
+                    std::forward<E>(e)));
+        }
+        resume_if_ready();
+    }
+
+    void set_stopped() && noexcept
+    {
+        constexpr auto idx = HasEc ? 4 : 3;
+        result_->template emplace<idx>(
+            stopped_t{});
+        resume_if_ready();
+    }
+};
+
+} // namespace detail
+
+/** Awaitable that bridges a stdexec sender
+    into a Capy coroutine.
+
+    Satisfies IoAwaitable. When co_awaited inside
+    a capy::task, connects the sender to a bridge
+    receiver, starts the operation, and resumes
+    the coroutine when the sender completes.
+
+    Uses an atomic exchange protocol to handle
+    senders that complete synchronously during
+    start(): whichever side arrives second
+    (receiver or await_suspend) resumes the
+    coroutine.
+
+    The bridge inspects the sender's error
+    completion signatures at compile time. If the
+    sender can complete with
+    set_error(std::error_code), await_resume
+    returns io_result so the error code is a
+    value, not an exception. Otherwise
+    await_resume returns the value directly and
+    genuine exceptions are rethrown.
+
+    @tparam Sender The stdexec sender type.
+*/
+template<class Sender>
+struct [[nodiscard]] sender_awaitable
+{
+    static constexpr bool has_ec =
+        detail::has_error_code_v<Sender>;
+
+    using value_tuple =
+        detail::sender_single_value_t<Sender>;
+    using variant_type =
+        detail::result_variant<
+            value_tuple, has_ec>;
+    using receiver_type =
+        detail::bridge_receiver<
+            value_tuple, has_ec>;
+    using op_state_type = decltype(
+        stdexec::connect(
+            std::declval<Sender>(),
+            std::declval<receiver_type>()));
+
+    Sender sndr_;
+    variant_type result_{};
+
+    alignas(op_state_type)
+    unsigned char op_buf_[sizeof(op_state_type)];
+    bool op_constructed_ = false;
+    std::atomic<bool> done_{false};
+
+    explicit sender_awaitable(Sender sndr)
+        : sndr_(std::move(sndr))
+    {
+    }
+
+    sender_awaitable(sender_awaitable&& o)
+        noexcept(
+            std::is_nothrow_move_constructible_v<
+                Sender>)
+        : sndr_(std::move(o.sndr_))
+    {
+    }
+
+    sender_awaitable(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable&&) = delete;
+
+    ~sender_awaitable()
+    {
+        if(op_constructed_)
+            std::launder(
+                reinterpret_cast<op_state_type*>(
+                    op_buf_))->~op_state_type();
+    }
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(
+        std::coroutine_handle<> h,
+        io_env const* env)
+    {
+        ::new(op_buf_) op_state_type(
+            stdexec::connect(
+                std::move(sndr_),
+                receiver_type{
+                    &result_, h,
+                    env->stop_token, &done_}));
+        op_constructed_ = true;
+        stdexec::start(
+            *std::launder(
+                reinterpret_cast<
+                    op_state_type*>(
+                        op_buf_)));
+
+        // If the sender completed during start(),
+        // the receiver already stored the result.
+        // Return h to resume without suspending.
+        if(done_.exchange(
+            true, std::memory_order_acq_rel))
+            return h;
+        return std::noop_coroutine();
+    }
+
+    auto await_resume()
+    {
+        if constexpr (has_ec)
+            return await_resume_ec();
+        else
+            return await_resume_no_ec();
+    }
+
+private:
+    // Sender can complete with
+    // set_error(error_code). Return io_result
+    // so the error code is a value, not an
+    // exception.
+    auto await_resume_ec()
+    {
+        // exception_ptr at index 3
+        if(result_.index() == 3)
+            std::rethrow_exception(
+                std::get<3>(result_));
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+        {
+            // stopped at index 4
+            if(result_.index() == 4)
+                return io_result<>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<>{
+                    std::get<2>(result_)};
+            return io_result<>{};
+        }
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+        {
+            using T = std::tuple_element_t<
+                0, value_tuple>;
+            if(result_.index() == 4)
+                return io_result<T>{
+                    make_error_code(
+                        error::canceled), T{}};
+            if(result_.index() == 2)
+                return io_result<T>{
+                    std::get<2>(result_), T{}};
+            return io_result<T>{
+                {},
+                std::get<0>(
+                    std::get<1>(
+                        std::move(result_)))};
+        }
+        else
+        {
+            if(result_.index() == 4)
+                return io_result<value_tuple>{
+                    make_error_code(
+                        error::canceled), value_tuple{}};
+            if(result_.index() == 2)
+                return io_result<value_tuple>{
+                    std::get<2>(result_), value_tuple{}};
+            return io_result<value_tuple>{
+                {},
+                std::get<1>(
+                    std::move(result_))};
+        }
+    }
+
+    // Sender does not complete with
+    // set_error(error_code). Return the value
+    // directly; rethrow exceptions.
+    auto await_resume_no_ec()
+    {
+        // exception_ptr at index 2
+        if(result_.index() == 2)
+            std::rethrow_exception(
+                std::get<2>(result_));
+        // stopped at index 3
+        if(result_.index() == 3)
+            throw detail::operation_cancelled{};
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+            return;
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+            return std::get<0>(
+                std::get<1>(
+                    std::move(result_)));
+        else
+            return std::get<1>(
+                std::move(result_));
+    }
+};
+
+/** Create an IoAwaitable from a stdexec sender.
+
+    If the sender can complete with
+    set_error(std::error_code), the returned
+    awaitable yields io_result so the error code
+    is a value, not an exception. Otherwise the
+    awaitable yields the value directly.
+
+    @par Example
+    @code
+    capy::task<int> compute(auto sched)
+    {
+        auto result = co_await await_sender(
+            stdexec::schedule(sched)
+                | stdexec::then(
+                    [] { return 42; }));
+        co_return result;
+    }
+    @endcode
+
+    @param sndr The sender to bridge.
+    @return An IoAwaitable that can be co_awaited
+        in a capy::task.
+*/
+template<class Sender>
+auto await_sender(Sender&& sndr)
+{
+    return sender_awaitable<
+        std::decay_t<Sender>>(
+            std::forward<Sender>(sndr));
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/bench/stdexec/sender_io_env.hpp b/bench/stdexec/sender_io_env.hpp
new file mode 100644
index 000000000..019edf8fe
--- /dev/null
+++ b/bench/stdexec/sender_io_env.hpp
@@ -0,0 +1,175 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// stdexec execution environment for benchmarks.
+//
+// Provides the capy executor adapter wrapping exec::static_thread_pool,
+// so capy::task can run on it.
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_SENDER_IO_ENV_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_SENDER_IO_ENV_HPP
+
+#include "awaitable_sender.hpp"
+
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/execution_context.hpp>
+
+#include <stdexec/execution.hpp>
+#include <exec/static_thread_pool.hpp>
+#include <exec/start_detached.hpp>
+
+#include <coroutine>
+
+// Minimal execution_context shell for exec::static_thread_pool.
+// exec::static_thread_pool does not inherit from capy's
+// execution_context, but the Executor concept requires context()
+// to return one. This stub satisfies the requirement without
+// any service machinery.
+struct static_pool_context
+    : boost::capy::execution_context
+{
+    static_pool_context()
+        : boost::capy::execution_context(this)
+    {}
+
+    ~static_pool_context()
+    {
+        shutdown();
+        destroy();
+    }
+
+    static_pool_context(static_pool_context const&) = delete;
+    static_pool_context& operator=(static_pool_context const&) = delete;
+};
+
+// Adapter making exec::static_thread_pool satisfy capy's
+// Executor concept so capy::task can run on it.
+struct sender_as_capy_executor
+{
+    exec::static_thread_pool* pool_;
+    static_pool_context* ctx_;
+
+    boost::capy::execution_context& context() const noexcept
+    {
+        return *ctx_;
+    }
+
+    void on_work_started() const noexcept {}
+    void on_work_finished() const noexcept {}
+
+    void post(boost::capy::continuation& cont) const;
+
+    // Return the handle for symmetric transfer so the
+    // caller resumes the coroutine inline.
+    std::coroutine_handle<>
+    dispatch(boost::capy::continuation& c) const
+    {
+        return c.h;
+    }
+
+    bool operator==(
+        sender_as_capy_executor const&) const noexcept = default;
+};
+
+// Heap-allocated trampoline; not zero-alloc by design.
+// Honest reflection of what exec::static_thread_pool costs.
+inline void sender_as_capy_executor::post(
+    boost::capy::continuation& cont) const
+{
+    // upon_error eats the set_error_t(exception_ptr) channel
+    // that stdexec::then advertises so start_detached's
+    // no-error precondition is satisfied at the type level.
+    exec::start_detached(
+        stdexec::schedule(pool_->get_scheduler())
+        | stdexec::then([&cont]() noexcept { cont.h.resume(); })
+        | stdexec::upon_error([](auto&&) noexcept {}));
+}
+
+// Forward declaration needed so pool_schedule_sender can name pool_scheduler.
+struct pool_scheduler;
+
+// Custom schedule-sender for pool_scheduler. Wraps the pool's native
+// schedule sender but reports pool_scheduler as the completion scheduler,
+// which stdexec's starts_on adapter requires.
+struct pool_schedule_sender
+{
+    using sender_concept = stdexec::sender_tag;
+
+    exec::static_thread_pool* pool_;
+    pool_scheduler const* sched_;
+
+    // exec::static_thread_pool::scheduler completes with set_value_t()
+    // and set_stopped_t() (when the receiver carries a stop token).
+    template<class Self, class Env>
+    static consteval auto get_completion_signatures() noexcept
+    {
+        return stdexec::completion_signatures<
+            stdexec::set_value_t(),
+            stdexec::set_stopped_t()>{};
+    }
+
+    struct env_t
+    {
+        pool_scheduler const* sched_;
+
+        auto query(stdexec::get_completion_scheduler_t<
+                   stdexec::set_value_t> const&) const noexcept
+            -> pool_scheduler const&
+        {
+            return *sched_;
+        }
+    };
+
+    env_t get_env() const noexcept { return {sched_}; }
+
+    template<class Receiver>
+    auto connect(Receiver&& rcvr) &&
+    {
+        return stdexec::connect(
+            pool_->get_scheduler().schedule(),
+            std::forward<Receiver>(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver&& rcvr) const&
+    {
+        return stdexec::connect(
+            pool_->get_scheduler().schedule(),
+            std::forward<Receiver>(rcvr));
+    }
+};
+
+// Scheduler wrapper that delegates schedule() to exec::static_thread_pool
+// but answers boost::capy's get_io_executor_t query. Required by the
+// capy::as_sender bridge in awaitable_sender.hpp, which queries the
+// receiver-env scheduler for the capy executor at instantiation time.
+struct pool_scheduler
+{
+    using scheduler_concept = stdexec::scheduler_t;
+
+    exec::static_thread_pool* pool_;
+    static_pool_context* ctx_;
+
+    pool_schedule_sender schedule() const noexcept
+    {
+        return {pool_, this};
+    }
+
+    auto query(boost::capy::get_io_executor_t const&) const noexcept
+        -> sender_as_capy_executor
+    {
+        return sender_as_capy_executor{pool_, ctx_};
+    }
+
+    bool operator==(pool_scheduler const&) const = default;
+};
+
+#endif
diff --git a/bench/stdexec/sndr_any_read_stream.hpp b/bench/stdexec/sndr_any_read_stream.hpp
new file mode 100644
index 000000000..fd9b67d1e
--- /dev/null
+++ b/bench/stdexec/sndr_any_read_stream.hpp
@@ -0,0 +1,74 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_SNDR_ANY_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_SNDR_ANY_READ_STREAM_HPP
+
+#include <boost/capy/buffers.hpp>
+
+#include <exec/any_sender_of.hpp>
+#include <stdexec/execution.hpp>
+
+#include <cstddef>
+#include <memory>
+#include <system_error>
+#include <utility>
+
+/// Type-erased sender returned from any_read_stream::read_some.
+using any_read_sender =
+    exec::any_sender<exec::any_receiver<stdexec::completion_signatures<
+        stdexec::set_value_t(std::size_t),
+        stdexec::set_error_t(std::error_code),
+        stdexec::set_error_t(std::exception_ptr),
+        stdexec::set_stopped_t()>>>;
+
+/// Value-type-erased read stream.
+///
+/// Holds a concrete stream by value via a polymorphic
+/// model. read_some returns any_read_sender so the
+/// concrete stream's sender type is hidden at the
+/// stream's API boundary.
+class sndr_any_read_stream
+{
+    struct concept_t
+    {
+        virtual any_read_sender read_some(
+            boost::capy::mutable_buffer) = 0;
+        virtual ~concept_t() = default;
+    };
+
+    template <class Stream>
+    struct model_t : concept_t
+    {
+        Stream stream_;
+
+        explicit model_t(Stream s) : stream_(std::move(s)) {}
+
+        any_read_sender read_some(
+            boost::capy::mutable_buffer buf) override
+        {
+            return any_read_sender(stream_.read_some(buf));
+        }
+    };
+
+    std::unique_ptr<concept_t> impl_;
+
+public:
+    template <class Stream>
+    explicit sndr_any_read_stream(Stream s)
+        : impl_(new model_t<Stream>(std::move(s)))
+    {}
+
+    any_read_sender read_some(boost::capy::mutable_buffer buf)
+    {
+        return impl_->read_some(buf);
+    }
+};
+
+#endif
diff --git a/bench/stdexec/sndr_io_read_stream.hpp b/bench/stdexec/sndr_io_read_stream.hpp
new file mode 100644
index 000000000..71a502685
--- /dev/null
+++ b/bench/stdexec/sndr_io_read_stream.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_SNDR_IO_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_SNDR_IO_READ_STREAM_HPP
+
+#include "sndr_any_read_stream.hpp"
+#include "sndr_read_stream.hpp"
+
+#include <boost/capy/buffers.hpp>
+
+/// Abstract interface for sender-based read streams.
+struct sndr_io_read_stream
+{
+    virtual any_read_sender
+        read_some(boost::capy::mutable_buffer) = 0;
+    virtual ~sndr_io_read_stream() = default;
+};
+
+/// Concrete implementation wrapping sndr_read_stream.
+struct sndr_io_read_stream_impl : sndr_io_read_stream
+{
+    sndr_read_stream stream_;
+
+    explicit sndr_io_read_stream_impl(
+        exec::static_thread_pool* pool)
+        : stream_{pool} {}
+
+    any_read_sender
+        read_some(boost::capy::mutable_buffer buf) override
+    {
+        return any_read_sender{stream_.read_some(buf)};
+    }
+};
+
+#endif
diff --git a/bench/stdexec/sndr_read_stream.hpp b/bench/stdexec/sndr_read_stream.hpp
new file mode 100644
index 000000000..52a7adb4f
--- /dev/null
+++ b/bench/stdexec/sndr_read_stream.hpp
@@ -0,0 +1,39 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_SNDR_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_SNDR_READ_STREAM_HPP
+
+#include <boost/capy/buffers.hpp>
+
+#include <exec/static_thread_pool.hpp>
+#include <stdexec/execution.hpp>
+
+#include <cstddef>
+
+/// No-op sender stream for benchmarks.
+///
+/// Holds an exec::static_thread_pool* (analogous to how a
+/// socket holds a reference to its execution context).
+/// read_some() returns starts_on(sched, just(0)); the
+/// sender is consumable by sender pipelines via connect
+/// and by exec::task / capy::task via co_await.
+struct sndr_read_stream
+{
+    exec::static_thread_pool* pool_;
+
+    auto read_some(boost::capy::mutable_buffer)
+    {
+        return stdexec::starts_on(
+            pool_->get_scheduler(),
+            stdexec::just(std::size_t{0}));
+    }
+};
+
+#endif
diff --git a/bench/stdexec/sndr_sync_read_stream.hpp b/bench/stdexec/sndr_sync_read_stream.hpp
new file mode 100644
index 000000000..95b5246a7
--- /dev/null
+++ b/bench/stdexec/sndr_sync_read_stream.hpp
@@ -0,0 +1,31 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_SNDR_SYNC_READ_STREAM_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_SNDR_SYNC_READ_STREAM_HPP
+
+#include <boost/capy/buffers.hpp>
+
+#include <stdexec/execution.hpp>
+
+#include <cstddef>
+
+/// Synchronous no-op sender stream.
+///
+/// read_some returns just(0); no scheduler trip.
+/// Used as the synchronous-baseline row in the bench.
+struct sndr_sync_read_stream
+{
+    auto read_some(boost::capy::mutable_buffer)
+    {
+        return stdexec::just(std::size_t{0});
+    }
+};
+
+#endif
diff --git a/build/Jamfile b/build/Jamfile
index 53809611c..fe25a8cdb 100644
--- a/build/Jamfile
+++ b/build/Jamfile
@@ -23,6 +23,10 @@ project boost/capy
   : common-requirements
     <link>shared:<define>BOOST_CAPY_DYN_LINK
     <link>static:<define>BOOST_CAPY_STATIC_LINK
+    <target-os>darwin:<cxxflags>-fexperimental-library
+    <target-os>darwin:<linkflags>-fexperimental-library
+    <target-os>freebsd:<cxxflags>-fexperimental-library
+    <target-os>freebsd:<linkflags>-fexperimental-library
   : usage-requirements
     <define>BOOST_CAPY_NO_LIB
   : source-location ..
diff --git a/cmake/boost_capy-config.cmake.in b/cmake/boost_capy-config.cmake.in
new file mode 100644
index 000000000..e095a4df1
--- /dev/null
+++ b/cmake/boost_capy-config.cmake.in
@@ -0,0 +1,4 @@
+@PACKAGE_INIT@
+include("${CMAKE_CURRENT_LIST_DIR}/boost_capy-targets.cmake")
+include("${CMAKE_CURRENT_LIST_DIR}/DiscoverTests.cmake")
+check_required_components(boost_capy)
diff --git a/doc/as_sender.md b/doc/as_sender.md
new file mode 100644
index 000000000..85859da02
--- /dev/null
+++ b/doc/as_sender.md
@@ -0,0 +1,170 @@
+# Zero-Allocation Sender-to-Awaitable Bridge
+
+Every IoAwaitable ever written - timers, mutexes, channels, semaphores,
+file I/O, sockets, database queries, HTTP clients - is now consumable
+by a sender pipeline. No coroutine frame. No heap allocation. Zero cost.
+
+## What This Is
+
+`as_sender` wraps any IoAwaitable in a P2300-compliant sender. A
+receiver attaches to the sender through `connect`. When `start` is
+called, the operation state drives the awaitable protocol directly -
+`await_ready`, `await_suspend`, `await_resume` - without ever creating
+a coroutine.
+
+The awaitable does not know it is talking to a sender. It sees a
+`coroutine_handle<>` and an `io_env*`, exactly as it would from a
+coroutine. The awaitable's code does not change. Not one line.
+
+```cpp
+// Wrap any IoAwaitable as a sender
+auto sndr = as_sender(stream.read_some(buf));
+
+// Attach a receiver and start the operation
+auto op = connect(std::move(sndr), my_receiver);
+start(op);
+```
+
+## How It Works
+
+The bridge rests on a single observation: all three major compilers
+(MSVC, GCC, Clang) lay out a coroutine frame with two function
+pointers at the front:
+
+```cpp
+struct coroutine_frame {
+    void (*resume)(coroutine_frame*);
+    void (*destroy)(coroutine_frame*);
+    // ... promise, locals, state ...
+};
+```
+
+When you call `handle.resume()`, the compiler calls the function
+pointer at offset zero. That is all it does.
+
+The bridge defines a lightweight struct that matches this layout:
+
+```cpp
+struct frame_cb {
+    void (*resume)(frame_cb*);
+    void (*destroy)(frame_cb*);
+    void* data;
+};
+```
+
+Three pointers. Twenty-four bytes on a 64-bit platform. The `resume`
+pointer holds the sender's completion callback. The `destroy` pointer
+is a no-op - the sender owns its own lifetime. The `data` pointer
+points back to the operation state.
+
+`std::coroutine_handle<>::from_address(&cb)` produces a handle that,
+when `.resume()` is called, invokes the function pointer at offset
+zero - which is our callback. The awaitable receives this handle. It
+cannot tell the difference. It does not need to.
+
+## The Flow
+
+Here is what happens, step by step:
+
+- **`as_sender(awaitable)`** stores the awaitable inside a sender.
+  Nothing runs yet. Senders are lazy.
+
+- **`connect(sender, receiver)`** produces an operation state. The
+  operation state holds the awaitable, the receiver, an `io_env`, and
+  a `frame_cb`. Everything lives on the operation state. No allocation.
+
+- **`start(op_state)`** begins the operation:
+
+  1. The executor and stop token are pulled from the receiver's
+     environment and stored in the `io_env`.
+
+  2. `await_ready()` is checked. If the awaitable is immediately
+     ready, the result is harvested and the receiver is signaled
+     inline.
+
+  3. Otherwise, the `frame_cb` is filled in: `resume` points to the
+     completion callback, `destroy` is a no-op, `data` points to the
+     operation state. A `coroutine_handle<>` is manufactured from the
+     `frame_cb`'s address. `await_suspend(handle, &env)` is called on
+     the awaitable.
+
+- **The awaitable runs.** It submits work to the reactor - a timer
+  fires, bytes arrive on a socket, a mutex unlocks. When the operation
+  completes, the reactor calls `executor.post(handle)` or
+  `executor.dispatch(handle)`.
+
+- **The executor calls `handle.resume()`.** Because the handle points
+  at the `frame_cb`, this calls the `resume` function pointer. The
+  callback recovers the operation state from `data`, calls
+  `await_resume()` to harvest the result, and signals the receiver
+  through `set_value`, `set_error`, or `set_stopped`.
+
+The awaitable went through its entire lifecycle - ready check,
+suspension, reactor submission, executor resumption, result harvest -
+without a coroutine ever existing.
+
+## What This Means
+
+The awaitable ecosystem and the sender ecosystem are no longer
+separate worlds. They are one world.
+
+Every IoAwaitable anyone has written becomes a sender with a single
+function call. Awaitable authors gain a new consumer base without
+modifying a single line of their code. Sender authors gain access to
+every I/O primitive the awaitable ecosystem has produced - and will
+produce - at zero allocation cost.
+
+- **One I/O implementation.** The library implements each operation
+  once as an IoAwaitable. Coroutines `co_await` it. Sender pipelines
+  consume it through `as_sender`. Both go through the same reactor,
+  the same executor, the same platform code.
+
+- **Zero allocation.** The `frame_cb` lives on the operation state.
+  No coroutine frame. No heap allocation. No bridge coroutine. The
+  previous implementation allocated a coroutine frame per I/O
+  operation just to obtain a `coroutine_handle<>`. That tax is gone.
+
+- **Full protocol fidelity.** The bridge respects `await_ready` for
+  synchronous fast-paths. It normalizes `await_suspend` return types
+  (`void`, `bool`, `coroutine_handle<>`). It propagates the executor
+  and stop token through `io_env`. It routes results to `set_value`,
+  errors to `set_error`, and cancellation to `set_stopped`.
+
+- **Transparent to the awaitable.** The awaitable sees a
+  `coroutine_handle<>` and an `io_env const*`. It does not know
+  whether the handle points at a coroutine frame or a `frame_cb`. It
+  does not need to know. The handle is the abstraction boundary, and
+  the abstraction holds.
+
+- **Works today.** This is not a proposal. It is shipping code. It
+  compiles and passes tests on MSVC, GCC, and Clang. The ABI
+  compatibility that makes it work is the same ABI reality documented
+  in P3203R0 and relied upon by Boost.Cobalt in production.
+
+## Example
+
+```cpp
+namespace capy = boost::capy;
+namespace ex = beman::execution;
+
+// A Capy IoAwaitable - a 500ms timer
+auto sndr = capy::as_sender(capy::delay(500ms));
+
+// Connect a receiver whose environment carries a Capy executor
+auto op = ex::connect(
+    std::move(sndr),
+    my_receiver{
+        {pool.get_executor(), stop_source.get_token()},
+        &done});
+
+// Start the operation - no coroutine frame allocated
+ex::start(op);
+```
+
+The receiver's environment provides the executor and stop token. The
+bridge threads them into the `io_env` that the awaitable expects. The
+timer fires, the executor resumes the handle, the receiver gets
+`set_value()`. Twenty-four bytes of `frame_cb` on the operation state.
+That is the entire cost.
+
+Welcome to the awaitable universe. The door is open.
diff --git a/doc/buffer-rationale.md b/doc/buffer-rationale.md
new file mode 100644
index 000000000..ff3753fad
--- /dev/null
+++ b/doc/buffer-rationale.md
@@ -0,0 +1,635 @@
+# Design Rationale: Buffer Representation and Ownership
+
+## Context
+
+This document captures the design space and trade-offs for the buffer
+subsystem in capy. The central question is how to represent memory
+regions for asynchronous I/O in a way that is zero-copy, composable
+with C++20 concepts, and safe across coroutine suspension points. The
+analysis applies to four interrelated design decisions:
+
+1. How to represent individual buffer regions and buffer sequences.
+2. How to customize buffer operations (size, slicing) without virtual
+   dispatch.
+3. How to manage resizable buffers (DynamicBuffer) with correct
+   lifetime semantics in coroutine-based APIs.
+4. How to model the two buffer ownership patterns (caller-owns vs.
+   callee-owns) for asynchronous data transfer.
+
+The design was shaped by 25 years of Asio practice, the constraints
+of C++20 coroutines, and the goal of supporting both POSIX scatter/gather
+I/O and layered protocol streams.
+
+## Current Design
+
+### Primitive Types
+
+Two non-owning reference types form the foundation:
+
+```cpp
+class mutable_buffer
+{
+    unsigned char* p_ = nullptr;
+    std::size_t n_ = 0;
+public:
+    constexpr mutable_buffer(void* data, std::size_t size) noexcept;
+    constexpr void* data() const noexcept;
+    constexpr std::size_t size() const noexcept;
+    mutable_buffer& operator+=(std::size_t n) noexcept;
+};
+
+class const_buffer
+{
+    unsigned char const* p_ = nullptr;
+    std::size_t n_ = 0;
+public:
+    constexpr const_buffer(void const* data, std::size_t size) noexcept;
+    constexpr const_buffer(mutable_buffer const& b) noexcept;
+    // ...
+};
+```
+
+`mutable_buffer` is implicitly convertible to `const_buffer`. Both
+support `operator+=` for advancing the start position without
+allocation.
+
+### Buffer Sequence Concepts
+
+```cpp
+template<typename T>
+concept ConstBufferSequence =
+    std::is_convertible_v<T, const_buffer> || (
+        std::ranges::bidirectional_range<T> &&
+        std::is_convertible_v<std::ranges::range_value_t<T>, const_buffer>);
+
+template<typename T>
+concept MutableBufferSequence =
+    std::is_convertible_v<T, mutable_buffer> || (
+        std::ranges::bidirectional_range<T> &&
+        std::is_convertible_v<std::ranges::range_value_t<T>, mutable_buffer>);
+```
+
+A single buffer satisfies the sequence concept (one-element range via
+pointer arithmetic on `begin`/`end`). This eliminates the need for
+callers to distinguish between single buffers and multi-buffer
+sequences.
+
+### Customization via tag_invoke
+
+Buffer operations (`buffer_size`, slicing) are customized through
+`tag_invoke` with dedicated tag types (`size_tag`, `slice_tag`). Types
+that provide a `tag_invoke` overload for `slice_tag` are sliced
+in-place; types that do not are wrapped in `slice_of<T>`.
+
+### DynamicBuffer with Coroutine Safety
+
+```cpp
+template<class T>
+concept DynamicBuffer = /* prepare/commit/consume interface */;
+
+template<class B>
+concept DynamicBufferParam =
+    DynamicBuffer<std::remove_cvref_t<B>> &&
+    (std::is_lvalue_reference_v<B> ||
+     requires { typename std::remove_cvref_t<B>::is_dynamic_buffer_adapter; });
+```
+
+`DynamicBufferParam` restricts rvalue passing to adapter types that
+reference external storage. Value types that store bookkeeping
+internally are rejected as rvalues, preventing silent data loss when
+a coroutine suspends.
+
+### Buffer Ownership Models
+
+Two concepts model asynchronous data transfer:
+
+- **BufferSource** (pull model): Callee produces data, caller consumes.
+  `pull()` returns buffer descriptors; `consume(n)` advances.
+
+- **BufferSink** (callee-owns-buffers): Callee provides writable memory,
+  caller writes into it. `prepare()` returns writable buffers;
+  `commit(n)` finalizes; `commit_eof(n)` signals end-of-stream.
+
+## Background
+
+### The Scatter/Gather I/O Model
+
+POSIX `readv` and `writev` accept arrays of `iovec` structures, each
+describing a contiguous memory region. This scatter/gather model avoids
+the cost of assembling a contiguous buffer before a system call. The
+buffer sequence concept is the C++ generalization of this model: any
+type that produces a range of `(pointer, size)` pairs can participate
+in I/O without copying data into an intermediate buffer.
+
+### The Asio Precedent
+
+Boost.Asio established the buffer sequence model that capy inherits.
+Asio's `const_buffer`, `mutable_buffer`, `ConstBufferSequence`, and
+`MutableBufferSequence` concepts have been stable across 20+ years of
+production use. The capy design preserves the conceptual model while
+modernizing the mechanism:
+
+- Asio uses `buffer_sequence_begin` / `buffer_sequence_end` free
+  functions and SFINAE-based traits. Capy uses C++20 concepts and
+  `std::ranges`.
+- Asio's `dynamic_string_buffer` and `dynamic_vector_buffer` accept
+  references. Capy adds the `DynamicBufferParam` concept to enforce
+  lifetime safety at compile time.
+- Asio has no equivalent of the `BufferSource` / `BufferSink` concepts,
+  which capy introduces for structured data transfer pipelines.
+
+### Coroutine Suspension and Buffer Lifetimes
+
+When a coroutine suspends, its local variables live in the coroutine
+frame. Parameters passed by reference may dangle if the caller's scope
+exits before resumption:
+
+```cpp
+// WRONG: buffers may dangle across co_await
+task<> read_some(MutableBufferSequence auto const& buffers);
+
+// CORRECT: buffers copied into coroutine frame
+task<> read_some(MutableBufferSequence auto buffers);
+```
+
+This constraint propagates through the design. `buffer_param` accepts
+a `const&` in its constructor because the outer template function has
+already captured the buffer sequence by value in the coroutine frame.
+`DynamicBufferParam` enforces the rule at the concept level:
+value types must be passed as lvalues (the caller retains ownership),
+while adapter types may be passed as rvalues (the external storage
+persists).
+
+## The Buffer Representation Question
+
+### Option R1: Non-Owning Pointer-Size Pair
+
+Use two lightweight types (`const_buffer`, `mutable_buffer`) that store
+a pointer and a size. No ownership, no allocation. The caller manages
+the underlying memory.
+
+**Arguments for:**
+
+1. **Zero overhead.** A buffer descriptor is two machine words. Copying,
+   comparing, and advancing are trivial operations.
+2. **Matches the OS model.** `iovec` is `{void*, size_t}`. The buffer
+   types are a direct, typesafe mapping.
+3. **Composable.** Single buffers satisfy the buffer sequence concept.
+   Multi-buffer containers (`buffer_array`, `buffer_pair`,
+   `std::array<mutable_buffer, N>`) compose naturally.
+4. **25 years of production stability.** Asio's identical representation
+   has survived without modification.
+
+**Arguments against:**
+
+1. **No lifetime tracking.** The caller must ensure the referenced
+   memory outlives the buffer descriptor. In coroutine contexts this
+   requires discipline (pass by value, not reference).
+2. **No capacity.** Unlike `span`, a buffer does not carry a "max size"
+   distinct from "current size." Resizable behavior requires a
+   separate DynamicBuffer wrapper.
+
+### Option R2: Owning Buffer with Embedded Storage
+
+Provide a buffer type that owns its memory, similar to `std::vector<char>`.
+
+**Arguments for:**
+
+1. **Lifetime safety by construction.** No dangling references.
+2. **Simpler mental model** for users unfamiliar with non-owning types.
+
+**Arguments against:**
+
+1. **Allocation cost.** Every buffer construction allocates. I/O
+   operations that should be zero-copy now copy into owned storage.
+2. **Incompatible with scatter/gather.** The OS provides memory (e.g.,
+   kernel buffers, memory-mapped regions); wrapping it in an owning
+   type requires copying.
+3. **Breaks the composition model.** A `read_some` that returns an
+   owning buffer cannot write into caller-provided memory.
+4. **No precedent.** No major I/O library (Asio, libuv, io_uring, Windows
+   IOCP) uses owning buffers at the primitive level.
+
+### Option R3: span-Based Representation
+
+Use `std::span<std::byte>` and `std::span<std::byte const>` directly.
+
+**Arguments for:**
+
+1. **Standard vocabulary type.** Users already know `span`.
+2. **Const-correctness through the type system.** `span<byte const>`
+   is read-only; `span<byte>` is writable.
+
+**Arguments against:**
+
+1. **Type pollution.** `span<byte>` is not implicitly convertible to
+   `span<byte const>` through the same mechanism as `mutable_buffer`
+   to `const_buffer`. The generic code that accepts both must use
+   additional template machinery.
+2. **No customization points.** `span` does not support `tag_invoke`
+   for size or slicing without wrapping.
+3. **Element type mismatch.** `span<byte>` requires callers to cast
+   from `void*` or `char*`. The buffer types accept `void*` directly,
+   which matches the POSIX and Asio conventions.
+4. **No `operator+=`.** Advancing a `span` requires constructing a
+   new subspan. The buffer types support in-place advance, which is
+   the dominant operation in I/O loops.
+
+**Recommendation:** Option R1. The pointer-size pair is the minimal
+representation that maps to the OS model, composes with scatter/gather
+I/O, and has decades of production stability.
+
+## The Customization Mechanism Question
+
+Buffer operations need customization: `buffer_size` should be O(1) for
+types that track total size, and slicing should be in-place for types
+that support it. The question is how to dispatch to type-specific
+implementations.
+
+### Option C1: tag_invoke
+
+Provide tag types (`size_tag`, `slice_tag`) and dispatch through ADL
+`tag_invoke`. Types that provide an overload get the optimized path;
+the default falls back to iteration.
+
+**Arguments for:**
+
+1. **Non-intrusive.** Third-party types can opt in without modifying
+   their class definition.
+2. **Composable.** The same mechanism handles `buffer_array`,
+   `buffer_pair`, `slice_of`, and user-defined types uniformly.
+3. **No virtual dispatch.** The call resolves at compile time.
+4. **Established pattern.** `tag_invoke` is the customization mechanism
+   used throughout the P2300 ecosystem.
+
+**Arguments against:**
+
+1. **Unfamiliar syntax.** `tag_invoke(slice_tag{}, bs, how, n)` is
+   harder to read than `bs.slice(how, n)`.
+2. **Discoverability.** Users cannot rely on IDE autocompletion to find
+   available customization points.
+
+### Option C2: Virtual Member Functions
+
+Use a base class with virtual `size()` and `slice()` methods.
+
+**Arguments for:**
+
+1. **Familiar OOP pattern.** Users understand virtual dispatch.
+2. **Discoverable.** IDE completion shows available methods.
+
+**Arguments against:**
+
+1. **Allocation and indirection.** Virtual dispatch requires a vtable
+   pointer. Buffer descriptors are two machine words; adding a vtable
+   pointer increases their size by 50%.
+2. **Incompatible with value semantics.** Buffers are copied freely in
+   I/O loops. Polymorphic types require heap allocation or slicing
+   protection.
+3. **Closed hierarchy.** New buffer types must inherit from the base
+   class, which forecloses types that cannot be modified.
+
+### Option C3: Concept-Based Overloading
+
+Overload free functions on concept constraints without `tag_invoke`.
+
+**Arguments for:**
+
+1. **Simpler.** No tag types needed.
+2. **C++20 native.** Concept-constrained overloads are the standard
+   mechanism.
+
+**Arguments against:**
+
+1. **Ambiguity.** Without tags, two overloads for "size" on different
+   concepts may conflict. `tag_invoke` scopes the customization point
+   to the tag type, preventing collision.
+2. **No fallback dispatch.** The default `buffer_size` iterates over
+   the sequence and sums individual sizes. With `tag_invoke`, the
+   default path and the optimized path coexist naturally; with concept
+   overloading, the mechanism for selecting "use the optimized version
+   if available, otherwise iterate" requires additional SFINAE.
+
+**Recommendation:** Option C1. `tag_invoke` provides non-intrusive,
+composable customization with compile-time dispatch. The syntax cost
+is paid by library implementers, not users, since the free functions
+(`buffer_size`, `keep_prefix`, `remove_prefix`, etc.) hide the
+dispatch.
+
+## The Dynamic Buffer Lifetime Question
+
+Dynamic buffers support resizable I/O targets (the `prepare` /
+`commit` / `consume` protocol). The question is how to enforce correct
+passing in coroutine APIs.
+
+### Option L1: Unconstrained Forwarding Reference
+
+Accept `DynamicBuffer auto&&` in coroutine functions.
+
+**Arguments for:**
+
+1. **Simplest signature.** No additional concept needed.
+
+**Arguments against:**
+
+1. **Silent data loss.** A value type like `flat_dynamic_buffer` passed
+   as an rvalue is moved into the coroutine frame. Its bookkeeping
+   (size, position) is local to the frame. When the coroutine completes,
+   the caller's original buffer is unchanged - the committed data is
+   silently discarded.
+
+### Option L2: Lvalue Reference Only
+
+Accept `DynamicBuffer auto&` in coroutine functions.
+
+**Arguments for:**
+
+1. **Correct for value types.** The caller retains ownership and
+   observes mutations.
+
+**Arguments against:**
+
+1. **Rejects valid adapters.** `string_dynamic_buffer` wraps an
+   external `std::string*`. Passing it as an rvalue is safe because
+   the external string retains the data. Requiring an lvalue forces
+   the caller to name every temporary adapter, adding friction:
+
+   ```cpp
+   // Rejected, but safe:
+   co_await read(stream, string_dynamic_buffer(&s));
+
+   // Required workaround:
+   auto buf = string_dynamic_buffer(&s);
+   co_await read(stream, buf);
+   ```
+
+### Option L3: DynamicBufferParam Concept
+
+Introduce a second concept that allows lvalues of any `DynamicBuffer`
+and rvalues only for types that define
+`using is_dynamic_buffer_adapter = void`:
+
+```cpp
+template<class B>
+concept DynamicBufferParam =
+    DynamicBuffer<std::remove_cvref_t<B>> &&
+    (std::is_lvalue_reference_v<B> ||
+     requires { typename std::remove_cvref_t<B>::is_dynamic_buffer_adapter; });
+```
+
+Coroutine functions use `DynamicBufferParam auto&&`.
+
+**Arguments for:**
+
+1. **Compile-time safety.** Value types passed as rvalues are rejected.
+   Adapter types passed as rvalues are accepted. The correct passing
+   convention is enforced, not documented.
+2. **Zero runtime cost.** The check is entirely in the type system.
+3. **Preserves ergonomics.** `co_await read(stream, dynamic_buffer(s))`
+   works because the factory returns an adapter type.
+
+**Arguments against:**
+
+1. **Requires opt-in tag.** Every adapter type must define
+   `is_dynamic_buffer_adapter`. Forgetting the tag causes a compile
+   error, which is the safe failure mode but adds a requirement for
+   implementers.
+2. **Two concepts for one abstraction.** Users must learn when to use
+   `DynamicBuffer` (non-coroutine, lvalue ref) vs `DynamicBufferParam`
+   (coroutine, forwarding ref).
+
+**Recommendation:** Option L3. The compile-time enforcement eliminates
+a class of silent data-loss bugs that are difficult to diagnose at
+runtime. The cost (an extra tag typedef and a second concept) is paid
+by library authors, not users.
+
+## The Buffer Ownership Question
+
+Asynchronous data transfer between a producer and a consumer requires
+a decision about who provides the memory. Two models exist.
+
+### Option O1: Caller-Owns Buffers (WriteSink / ReadStream)
+
+The caller provides buffers; the I/O operation reads from or writes
+into them:
+
+```cpp
+auto [ec, n] = co_await stream.write_some(caller_buffers);
+```
+
+**Arguments for:**
+
+1. **Caller controls allocation.** Stack buffers, pooled buffers, and
+   memory-mapped regions are all usable without adaptation.
+2. **Natural for stream I/O.** `read_some` / `write_some` have always
+   worked this way.
+3. **No internal buffering.** The data path is caller -> kernel, with
+   no intermediate copy.
+
+**Arguments against:**
+
+1. **Caller must manage buffer lifetime.** The buffers must remain
+   valid until the I/O completes (coroutine resumes).
+2. **Does not support zero-copy callee-initiated transfers.** If the
+   sink has internal storage (compression buffer, TLS record buffer),
+   copying from caller buffers into internal storage is unavoidable.
+
+### Option O2: Callee-Owns Buffers (BufferSink)
+
+The sink provides writable memory; the caller writes directly into it:
+
+```cpp
+auto dst_bufs = sink.prepare(dst_arr);
+std::size_t n = buffer_copy(dst_bufs, src_bufs);
+auto [ec] = co_await sink.commit(n);
+```
+
+**Arguments for:**
+
+1. **Zero-copy into internal storage.** The caller writes directly into
+   the sink's compression buffer, TLS record buffer, or kernel buffer.
+   No intermediate copy.
+2. **Sink controls memory layout.** The sink can align buffers, size
+   them for protocol framing, or provide buffers from a pool.
+3. **Enables back-pressure.** An empty `prepare()` return signals that
+   the sink has no available space; the caller must wait for `commit`
+   to flush.
+
+**Arguments against:**
+
+1. **Sink must provide storage.** If the sink is a raw socket, it must
+   either maintain an internal buffer or delegate to the kernel. For
+   simple streams this is unnecessary overhead.
+2. **More complex protocol.** Three operations (`prepare`, write,
+   `commit`) vs. one (`write_some`).
+
+### Option O3: Both Models, Separate Concepts
+
+Provide both `WriteSink` (caller-owns) and `BufferSink` (callee-owns)
+as distinct concepts. Similarly, provide both `ReadStream`
+(caller-owns) and `BufferSource` (callee-owns) for producers.
+
+**Arguments for:**
+
+1. **Each model fits its natural domain.** Stream I/O uses
+   caller-owns (the Asio model). Layered protocols and compression
+   use callee-owns. Neither model subsumes the other.
+2. **No forced adaptation.** A raw socket implements `WriteStream`
+   directly. A TLS layer implements `BufferSink` directly. Neither
+   must pretend to be the other.
+3. **Transfer algorithms compose the two.** A generic `transfer`
+   function can connect a `BufferSource` to a `BufferSink`, or a
+   `BufferSource` to a `WriteStream`, choosing the ownership model
+   that minimizes copies for each pairing.
+
+**Arguments against:**
+
+1. **Two concepts where one might suffice.** Users must learn both
+   models and understand which to use.
+2. **Adapter proliferation.** Converting between models requires
+   adapter types.
+
+**Recommendation:** Option O3. The two ownership models serve
+different domains and neither subsumes the other. Providing both as
+first-class concepts enables the library to minimize copies at each
+layer boundary.
+
+## The Windowed Access Question
+
+Buffer sequences may contain many elements. Passing them through
+virtual function boundaries or to system calls that accept a limited
+number of `iovec` structures requires batching.
+
+### Option W1: Flatten to Contiguous Buffer
+
+Copy all data into a single contiguous buffer before the system call.
+
+**Arguments for:**
+
+1. **Simplest code.** A single buffer needs no batching logic.
+
+**Arguments against:**
+
+1. **Allocation and copy cost.** For large transfers this is
+   prohibitive.
+2. **Defeats scatter/gather.** The entire point of buffer sequences
+   is to avoid this copy.
+
+### Option W2: buffer_param Windowed Wrapper
+
+Wrap the buffer sequence in `buffer_param`, which maintains a sliding
+window of up to `max_iovec` buffer descriptors. `data()` returns the
+current window as a `span`; `consume(n)` advances:
+
+```cpp
+task<> write(ConstBufferSequence auto buffers)
+{
+    buffer_param bp(buffers);
+    while(true)
+    {
+        auto bufs = bp.data();
+        if(bufs.empty())
+            break;
+        auto n = co_await do_write(bufs);
+        bp.consume(n);
+    }
+}
+```
+
+**Arguments for:**
+
+1. **Zero allocation.** The window is a fixed-size array in the
+   `buffer_param` object.
+2. **Natural batch size.** The window size matches the OS limit for
+   scatter/gather I/O (`IOV_MAX`).
+3. **Enables virtual dispatch.** The template captures the buffer
+   sequence type; the virtual function receives `std::span<const_buffer>`.
+   This bridges templates and virtual functions without type erasure.
+4. **Empty buffers are skipped.** The window contains only non-empty
+   buffers, which is a requirement of most OS scatter/gather APIs.
+
+**Arguments against:**
+
+1. **Fixed window size.** If the OS supports more `iovec` entries than
+   `max_iovec`, the window is unnecessarily small. In practice,
+   `IOV_MAX` is 1024 on Linux and `max_iovec` is tuned accordingly.
+
+**Recommendation:** Option W2. The windowed wrapper eliminates
+allocation, matches the OS batch size, and enables the template-to-
+virtual-function bridge that layered protocol implementations require.
+
+## Areas of Agreement
+
+1. **Buffers are non-owning reference types.** The primitive buffer
+   types describe memory; they do not own it. Ownership is the
+   caller's responsibility, managed through stack allocation, dynamic
+   buffers, or external containers.
+
+2. **Single buffers satisfy the sequence concept.** Requiring callers
+   to wrap a single buffer in an array or span adds friction with no
+   corresponding benefit. The `begin`/`end` overloads that return
+   pointers to a single buffer eliminate this friction.
+
+3. **Coroutine APIs must accept buffer sequences by value.** Reference
+   parameters dangle across suspension points. This is a hard
+   constraint of C++20 coroutines, not a design preference.
+
+4. **DynamicBuffer lifetime enforcement belongs in the type system.**
+   A compile-time error for `flat_dynamic_buffer` passed as an rvalue
+   to a coroutine is strictly better than silent data loss at runtime.
+
+5. **Both buffer ownership models are necessary.** Caller-owns is
+   natural for stream I/O. Callee-owns is natural for layered
+   protocols. Neither subsumes the other.
+
+6. **Customization should be non-intrusive.** Third-party buffer types
+   must be able to opt into optimized `buffer_size` and slicing without
+   modifying their class definitions.
+
+## Areas of Disagreement
+
+1. **Whether `tag_invoke` is the right customization mechanism.** The
+   P2300 ecosystem uses `tag_invoke` extensively, but WG21 has moved
+   toward `tag_invoke`'s successor proposals. The design could be
+   updated to use a newer mechanism without changing the conceptual
+   model.
+
+2. **Whether two concepts (`DynamicBuffer` and `DynamicBufferParam`)
+   are acceptable complexity.** One view holds that the compile-time
+   safety justifies the additional concept. The other holds that a
+   single concept with clear documentation is sufficient, and that
+   the adapter tag is an implementation detail that leaks into the
+   concept definition.
+
+3. **Whether `BufferSink` should use synchronous or asynchronous
+   `prepare`.** The current design makes `prepare` synchronous (it
+   returns a span immediately) and `commit` asynchronous. An
+   alternative makes both asynchronous, allowing the sink to wait for
+   internal buffer space. The synchronous design was chosen because
+   `prepare` is a memory operation (provide a pointer), not an I/O
+   operation, and back-pressure is signaled by returning an empty span
+   rather than by suspending.
+
+## Summary
+
+| Decision                | Chosen Design               | Alternative                     | Rationale                                              |
+| ----------------------- | --------------------------- | ------------------------------- | ------------------------------------------------------ |
+| Buffer representation   | Non-owning pointer-size     | Owning buffer, span             | Zero overhead, matches OS model, 25 years of stability |
+| Buffer sequence concept | Convertible-or-range        | Iterator pair, span only        | Single buffers compose naturally                       |
+| Customization           | tag_invoke with tag types   | Virtual dispatch, concept overload | Non-intrusive, composable, no runtime cost          |
+| DynamicBuffer lifetime  | DynamicBufferParam concept  | Unconstrained, lvalue-only      | Compile-time enforcement of coroutine safety           |
+| Buffer ownership        | Both caller-owns and callee-owns | Single model              | Neither subsumes the other                             |
+| Windowed access         | buffer_param sliding window | Flatten to contiguous           | Zero allocation, matches OS batch size                 |
+| Slicing                 | In-place via tag_invoke + slice_of fallback | Always copy  | Types that track size can slice in O(1)                |
+
+The buffer subsystem is designed around a single principle: buffers
+describe memory, they do not own it. This principle propagates through
+every layer - from the two-word primitive types, through the sequence
+concepts that treat single buffers and ranges uniformly, to the
+DynamicBuffer adapters that reference external storage, to the
+BufferSource and BufferSink concepts that structure data transfer
+without dictating who provides the memory. The coroutine lifetime
+constraint (pass by value) and the DynamicBufferParam concept are
+consequences of this principle: when memory is not owned by the buffer
+descriptor, lifetime must be managed explicitly, and the type system
+should enforce correct management at compile time.
diff --git a/doc/buffers-asio.md b/doc/buffers-asio.md
new file mode 100644
index 000000000..7e01f7679
--- /dev/null
+++ b/doc/buffers-asio.md
@@ -0,0 +1,250 @@
+# Buffer Sequence Theory
+
+This document explains Asio's buffer sequence abstraction - what it is, what rules govern it, and how users extend it with their own types.
+
+## The Buffer Primitive
+
+A buffer is a pointer and a size. It describes a contiguous region of memory without owning it.
+
+Asio defines two buffer types:
+
+- `mutable_buffer` - writable memory (`void*` + `size_t`)
+- `const_buffer` - read-only memory (`const void*` + `size_t`)
+
+Both expose two member functions: `data()` returns the pointer, `size()` returns the byte count.
+
+`mutable_buffer` is implicitly convertible to `const_buffer` (writable memory can always be read). The reverse conversion is disallowed - you cannot write to read-only memory.
+
+The pointer type is `void*`, not `std::byte*`. This is deliberate. POSIX uses `void*` in its I/O structures (`iovec`) for semantic neutrality - raw I/O should not opine on what the memory contains. The buffer types preserve this neutrality.
+
+These types are non-owning descriptors. They reference memory but do not manage its lifetime. Creating a buffer from a pointer does not allocate, copy, or extend the life of anything. The caller is responsible for ensuring the memory remains valid while the buffer is in use.
+
+## Why Sequences
+
+Operating systems support scatter-gather I/O. A gather-write (`writev` on POSIX, scatter/gather with IOCP on Windows) transmits multiple buffers in a single syscall. A scatter-read (`readv`) receives data into multiple buffers at once.
+
+This is important for performance. Consider sending an HTTP response: the status line is in one buffer, each header in another, the body in yet another. Without scatter-gather, you must copy everything into a single contiguous allocation before writing. With scatter-gather, you pass all the buffers to one syscall and the kernel handles the rest.
+
+A buffer sequence is the abstraction that represents this collection of buffers. It is the C++ type that maps to the array of `iovec` structures that the OS expects.
+
+## The Abstraction
+
+A buffer sequence is any type that produces a bidirectional iteration of buffers.
+
+More precisely: a type `T` is a buffer sequence if the free functions `buffer_sequence_begin(t)` and `buffer_sequence_end(t)` return bidirectional iterators whose value type is convertible to `const_buffer` (for read operations) or `mutable_buffer` (for write-into operations).
+
+### Customization Points
+
+`buffer_sequence_begin` and `buffer_sequence_end` are free functions that serve as customization points. For standard containers, Asio provides default overloads that call `begin()` and `end()`. For user-defined types, the user provides overloads found via ADL (argument-dependent lookup).
+
+This is the same customization pattern used throughout Asio. The type's namespace determines which overload is found. Wrapping a buffer sequence in a type-erasing container (like stuffing it into a lambda or a `std::function`) destroys the type information that ADL needs, breaking the mechanism.
+
+### Why Bidirectional
+
+The iterators must be at least bidirectional - not merely forward. Two reasons:
+
+1. Algorithms that consume buffer sequences sometimes need to traverse backwards. When removing a prefix from a buffer sequence (consuming bytes from the front after a partial read), the implementation may need to adjust the first unconsumed buffer.
+
+2. A read or write operation fills or drains buffers in order, front to back. If the operation is interrupted partway through a buffer, the implementation needs to locate that buffer and adjust its starting position for the next call. Bidirectional iteration simplifies this bookkeeping.
+
+Forward-only ranges do not satisfy the buffer sequence requirements.
+
+### The Single-Buffer Case
+
+A lone `const_buffer` or `mutable_buffer` is itself a valid buffer sequence - a sequence of exactly one element. Asio provides overloads of `buffer_sequence_begin` and `buffer_sequence_end` that return a pointer to the buffer and a pointer one past it, respectively. This makes a single buffer act like a one-element array.
+
+This unification matters: any function that accepts a buffer sequence also accepts a single buffer. There is no need for separate overloads.
+
+```cpp
+template<ConstBufferSequence Buffers>
+void send(const Buffers& buffers);
+
+const_buffer single = ...;
+send(single);                              // one buffer
+
+std::array<const_buffer, 3> multiple = ...;
+send(multiple);                            // three buffers
+```
+
+Both calls use the same function template. The concept is satisfied in both cases.
+
+## The Formal Rules
+
+A type `X` satisfies `ConstBufferSequence` if:
+
+- `X` is `Destructible` and `CopyConstructible`
+- `buffer_sequence_begin(x)` and `buffer_sequence_end(x)` return bidirectional iterators whose value type is convertible to `const_buffer`
+- After copy construction `X u(x)`, the sequence of buffers in `u` is identical to the sequence in `x` - each corresponding buffer has the same `data()` pointer and the same `size()`
+
+A type `X` satisfies `MutableBufferSequence` if the same rules hold with `mutable_buffer` in place of `const_buffer`.
+
+Every `MutableBufferSequence` is automatically a `ConstBufferSequence`, because `mutable_buffer` converts to `const_buffer`. A function that accepts `ConstBufferSequence` will accept mutable buffer sequences without any additional work.
+
+### The Copy Postcondition
+
+The third rule deserves emphasis. After copying a buffer sequence, the copy must describe the exact same memory regions as the original. Same pointers. Same sizes. The copy is shallow - it duplicates the descriptors, not the bytes they point at.
+
+This means a buffer sequence cannot own the memory it describes. If a type held an internal `std::string` and yielded a `const_buffer` pointing at that string's data, copying the type would copy the string to a new address. The copy's `data()` pointers would differ from the original's, violating the postcondition. Buffer sequences must reference externally-owned memory.
+
+## What Already Satisfies the Requirements
+
+Any standard bidirectional container of buffers works:
+
+```cpp
+std::array<const_buffer, 4> bufs;    // fixed-size, stack-allocated
+std::vector<mutable_buffer> bufs;    // dynamic
+std::list<const_buffer> bufs;        // linked, bidirectional
+```
+
+These types are `CopyConstructible`, their `begin()`/`end()` return bidirectional iterators, and their value types convert to the appropriate buffer type. Asio's default overloads of `buffer_sequence_begin`/`buffer_sequence_end` delegate to the container's own iterators.
+
+A single `const_buffer` or `mutable_buffer` also satisfies the requirements, as described above.
+
+A `std::forward_list<const_buffer>` does not qualify - its iterators are forward-only, not bidirectional.
+
+## Writing Your Own Buffer Sequence
+
+There are two ways to make a user-defined type satisfy the buffer sequence requirements.
+
+### Provide begin() and end() Members
+
+If your type behaves like a container - it has `begin()` and `end()` member functions returning bidirectional iterators over buffers - then Asio's default `buffer_sequence_begin`/`buffer_sequence_end` overloads will find them automatically:
+
+```cpp
+class header_buffers
+{
+    const_buffer bufs_[3];
+
+public:
+    header_buffers(
+        const_buffer status_line,
+        const_buffer headers,
+        const_buffer separator)
+        : bufs_{status_line, headers, separator}
+    {
+    }
+
+    const const_buffer* begin() const { return bufs_; }
+    const const_buffer* end() const { return bufs_ + 3; }
+};
+```
+
+This type is `CopyConstructible` (the default copy copies the array of descriptors, preserving `data()` pointers and sizes). Its `begin()`/`end()` return pointers, which are random-access iterators (and therefore bidirectional). It satisfies `ConstBufferSequence`.
+
+### Provide ADL Overloads
+
+For types where `begin()`/`end()` members are not appropriate, provide free function overloads of `buffer_sequence_begin` and `buffer_sequence_end` in the same namespace as the type:
+
+```cpp
+namespace app {
+
+class composite_buffers
+{
+    const_buffer bufs_[2];
+
+public:
+    composite_buffers(const_buffer head, const_buffer body)
+        : bufs_{head, body}
+    {
+    }
+
+    friend const const_buffer*
+    buffer_sequence_begin(const composite_buffers& b)
+    {
+        return b.bufs_;
+    }
+
+    friend const const_buffer*
+    buffer_sequence_end(const composite_buffers& b)
+    {
+        return b.bufs_ + 2;
+    }
+};
+
+} // namespace app
+```
+
+ADL finds the friend functions when Asio calls `buffer_sequence_begin(x)` with an `app::composite_buffers` argument.
+
+### A More Interesting Example
+
+The real power of user-defined buffer sequences is lazy composition. Consider a type that concatenates two buffer sequences without allocating:
+
+```cpp
+template<class BS1, class BS2>
+class buffers_cat
+{
+    BS1 bs1_;
+    BS2 bs2_;
+
+public:
+    class const_iterator
+    {
+        // Bidirectional iterator that walks bs1_ first, then bs2_.
+        // When it reaches the end of bs1_, it transitions to
+        // the beginning of bs2_. Decrementing from the beginning
+        // of bs2_ transitions back to the end of bs1_.
+        // ...
+    };
+
+    buffers_cat(BS1 bs1, BS2 bs2)
+        : bs1_(std::move(bs1))
+        , bs2_(std::move(bs2))
+    {
+    }
+
+    const_iterator begin() const;
+    const_iterator end() const;
+};
+```
+
+Iterating this type yields all buffers from the first sequence followed by all buffers from the second. No allocation occurs - the composed sequence is a view over the two sub-sequences. The resulting type satisfies `ConstBufferSequence` (assuming both sub-sequences do), and it can be passed directly to `async_write`.
+
+This is the composition that concrete types like `span<span<byte>>` cannot provide without allocation.
+
+## Ownership and Lifetime
+
+Buffer sequences have a two-layer ownership model. The buffer sequence object (the descriptor) and the underlying memory (the bytes it points at) follow separate rules.
+
+### The Implementation Copies the Sequence
+
+When an asynchronous read or write operation is initiated, the implementation stores a copy of the buffer sequence inside its composed operation state. The Asio specification states:
+
+> If a read or write operation is also an asynchronous operation, the operation shall maintain one or more copies of the buffer sequence until such time as the operation no longer requires access to the memory specified by the buffers in the sequence.
+
+This is why `CopyConstructible` is a requirement. It is not an abstract nicety - the implementation literally copies the buffer sequence object into its internal state so it can re-use it across the multiple `async_read_some` or `async_write_some` calls that compose the full operation.
+
+### The Caller Owns the Memory
+
+The implementation copies the buffer sequence object, but it never copies the underlying bytes. The Asio documentation for `async_read` and `async_write` states:
+
+> Although the buffers object may be copied as necessary, ownership of the underlying memory blocks is retained by the caller, which must guarantee that they remain valid until the completion handler is called.
+
+More precisely, the memory must remain valid until:
+
+- the last copy of the buffer sequence is destroyed, or
+- the completion handler is invoked,
+
+whichever comes first.
+
+### What This Means in Practice
+
+The buffer sequence is a view. It describes memory it does not own. The implementation copies the view. The caller owns the memory the view points at.
+
+A common mistake: passing a buffer that references a local variable to an asynchronous operation, then returning from the function before the operation completes. The local variable is destroyed, the buffer's `data()` pointer dangles, and the operation reads or writes garbage.
+
+```cpp
+void bad_example(tcp::socket& sock)
+{
+    char buf[1024];
+    // buf is on the stack - it will be destroyed when
+    // this function returns, but the async operation
+    // has not completed yet
+    async_read(sock, mutable_buffer(buf, sizeof(buf)),
+        [](error_code ec, std::size_t n) { /* ... */ });
+}
+```
+
+The buffer sequence (a single `mutable_buffer`) is copied into the async operation's state - that copy is fine. But the memory at `buf` ceases to exist when `bad_example` returns. The operation proceeds to write into a destroyed stack frame.
+
+The fix is to ensure the memory outlives the operation - allocate on the heap, use a member variable, or tie the buffer's lifetime to the completion handler via a shared pointer or similar mechanism.
diff --git a/doc/buffers-capy.md b/doc/buffers-capy.md
new file mode 100644
index 000000000..14a5aae53
--- /dev/null
+++ b/doc/buffers-capy.md
@@ -0,0 +1,310 @@
+# Boost.Capy Buffer System - Technical Report
+
+## 1. General Principle
+
+Capy's buffer model descends directly from Boost.Asio's Networking TS design (N4771). The central insight: **I/O buffers are not byte ranges - they are memory region descriptors**. A single buffer is a `(void*, size_t)` pair that describes a contiguous memory region without owning it and without making semantic claims about its contents (unlike `std::span<std::byte>`). A *buffer sequence* is a bidirectional range of such descriptors, enabling scatter/gather I/O to map directly onto OS primitives like POSIX `writev`/`readv`.
+
+The design differs from raw ranges in three critical ways:
+
+- **`buffer_size` vs `ranges::size`** - `ranges::size` on `array<const_buffer, 3>` returns 3 (count of descriptors). `buffer_size` returns the sum of all bytes across all descriptors. I/O code needs total bytes, not descriptor count.
+- **Element shrinking** - Range algorithms drop whole elements. Buffer algorithms shrink individual elements (advance a pointer, reduce a size) to model partial consumption.
+- **Zero-allocation composition** - Concrete types like `span<span<byte>>` require allocation to concatenate. Buffer sequences compose at compile time through concept-constrained templates.
+
+---
+
+## 2. Foundation Types
+
+### `mutable_buffer`
+
+A non-owning reference to a writable memory region.
+
+- **Internal state**: `unsigned char* p_` and `std::size_t n_`
+- **Construction**: from `(void*, size_t)` - stores the pointer as `unsigned char*`
+- **API**: `data()` returns `void*`, `size()` returns byte count
+- **`operator+=`**: advances `p_` forward by `n` bytes (clamped to `n_`), shrinking the region - the fundamental "consume from front" operation
+- **Slice CPO**: `tag_invoke(slice_tag, mutable_buffer&, slice_how, size_t)` dispatches to `remove_prefix` (advance) or `keep_prefix` (truncate)
+
+### `const_buffer`
+
+A non-owning reference to a read-only memory region. Structurally identical to `mutable_buffer` but stores `unsigned char const*`.
+
+- **Implicit conversion from `mutable_buffer`**: enables any function accepting `ConstBufferSequence` to work with mutable buffers
+- Same `operator+=` and `tag_invoke(slice_tag)` semantics
+
+### Key design choice: `void*` not `std::byte*`
+
+The types use `void*`/`void const*` in their public API (`data()` returns), matching POSIX `iovec` semantics. This makes no semantic claim about buffer contents - the memory could hold characters, integers, protocol frames, or raw binary. `std::byte` would impose the opinion that the contents are "bytes" and support bitwise operations, which is not always the right abstraction for I/O.
+
+---
+
+## 3. Buffer Sequence Concepts
+
+### `ConstBufferSequence`
+
+```cpp
+template<typename T>
+concept ConstBufferSequence =
+    std::is_convertible_v<T, const_buffer> || (
+        std::ranges::bidirectional_range<T> &&
+        std::is_convertible_v<std::ranges::range_value_t<T>, const_buffer>);
+```
+
+Two satisfaction paths:
+
+1. **Single buffer**: the type itself converts to `const_buffer` (e.g., `const_buffer`, `mutable_buffer`)
+2. **Range of buffers**: a bidirectional range whose elements convert to `const_buffer` (e.g., `std::array<const_buffer, N>`, `std::vector<const_buffer>`)
+
+### `MutableBufferSequence`
+
+Same structure, but elements must convert to `mutable_buffer`. Every `MutableBufferSequence` is also a `ConstBufferSequence` (because `mutable_buffer` converts to `const_buffer`).
+
+### Uniform iteration: `begin()` / `end()` CPOs
+
+Customization point objects that handle both cases uniformly:
+
+- **Single buffer** (convertible to `const_buffer`): returns `&b` / `&b + 1`, treating it as a one-element sequence
+- **Range**: delegates to `std::ranges::begin` / `std::ranges::end`
+
+This allows all buffer algorithms to iterate uniformly regardless of whether the input is a single buffer or a multi-buffer sequence.
+
+---
+
+## 4. Customization Protocol: `tag_invoke`
+
+Capy uses two tag types for customization:
+
+### `size_tag` - customizing `buffer_size`
+
+The default `tag_invoke(size_tag, ...)` iterates all buffers and sums their sizes - O(n). User types can overload for O(1):
+
+```cpp
+std::size_t tag_invoke(size_tag const&, my_type const& x) noexcept {
+    return x.cached_size_;
+}
+```
+
+`buffer_array` does exactly this - it caches `size_` and returns it in O(1).
+
+### `slice_tag` + `slice_how` - customizing slicing
+
+A single overload handles both `remove_prefix` and `keep_prefix` via the `slice_how` enum. This forces types to implement both operations or neither, preventing irregular APIs. The free functions `keep_prefix`, `remove_prefix`, `keep_suffix`, `remove_suffix` (and their non-mutating counterparts `prefix`, `sans_prefix`, `suffix`, `sans_suffix`) all dispatch through `tag_invoke(slice_tag, ...)`.
+
+The `slice_type<T>` alias selects between:
+
+- `T` itself (if `T` has a `tag_invoke(slice_tag)` overload)
+- `slice_of<T>` (a general-purpose wrapper that tracks byte offsets into an arbitrary sequence)
+
+---
+
+## 5. Buffer Algorithms
+
+### `buffer_size`
+
+Sums `size()` across all buffers. Dispatches through `tag_invoke(size_tag)`.
+
+### `buffer_empty`
+
+Short-circuits on the first non-zero-size buffer. More efficient than `buffer_size() == 0` for large sequences.
+
+### `buffer_length`
+
+Returns the count of buffer descriptors (not bytes). Uses random-access subtraction when possible, linear counting otherwise.
+
+### `buffer_copy`
+
+The workhorse algorithm. Copies bytes from a `ConstBufferSequence` source to a `MutableBufferSequence` destination, handling the scatter/gather complexity of iterating through discontiguous regions. Uses `memcpy` on each contiguous chunk - not byte-by-byte iteration. Accepts an optional `at_most` parameter. Returns total bytes copied.
+
+The implementation maintains dual iterators (`it0`/`it1`) and position trackers (`pos0`/`pos1`) to handle partial buffer consumption at both source and destination boundaries.
+
+### `front`
+
+Returns the first buffer in a sequence, or an empty buffer if the sequence is empty. Preserves mutability.
+
+### Slice operations
+
+Eight operations in two groups:
+
+**In-place mutating** (require `tag_invoke(slice_tag)` on the type):
+
+- `keep_prefix(bs, n)` - trim to first n bytes
+- `keep_suffix(bs, n)` - trim to last n bytes (computed via `remove_prefix(size - n)`)
+- `remove_prefix(bs, n)` - drop first n bytes
+- `remove_suffix(bs, n)` - drop last n bytes (computed via `keep_prefix(size - n)`)
+
+**Non-mutating** (return a new value, wrapping in `slice_of<T>` if needed):
+
+- `prefix(bs, n)` - copy, then `keep_prefix`
+- `suffix(bs, n)` - copy, then `keep_suffix`
+- `sans_prefix(bs, n)` - copy, then `remove_prefix`
+- `sans_suffix(bs, n)` - copy, then `remove_suffix`
+
+### `slice_of<BufferSequence>`
+
+A general-purpose view over a sub-range of any buffer sequence. Stores the original sequence by value plus `begin_`, `end_` indices, `prefix_` and `suffix_` byte offsets, and `size_`. Its `const_iterator::operator*` adjusts the first and last buffer elements for the prefix/suffix byte offsets. This is the fallback when a type does not provide its own `tag_invoke(slice_tag)`.
+
+---
+
+## 6. Concrete Buffer Containers
+
+### `buffer_pair`
+
+Simple type aliases:
+
+- `const_buffer_pair = std::array<const_buffer, 2>`
+- `mutable_buffer_pair = std::array<mutable_buffer, 2>`
+
+With custom `tag_invoke(slice_tag)` overloads that can shrink individual buffers within the pair. Used by `circular_dynamic_buffer` whose data/prepare may span two discontiguous regions.
+
+### `buffer_array<N, IsConst>`
+
+A fixed-capacity array holding 0 to N buffer descriptors. Key features:
+
+- **Union-based storage** with placement new - avoids default-constructing unused slots
+- **Filters empty buffers** during construction - never stores zero-size descriptors
+- **Cached `size_`** for O(1) `buffer_size` via `tag_invoke(size_tag)`
+- **Two construction modes**: silent truncation (drops excess buffers) vs `std::in_place_t` (throws `length_error`)
+- **Span conversion** - implicit conversion to `std::span<value_type>`
+- **Custom slicing** via `tag_invoke(slice_tag)` - delegates to compiled `.cpp` helper functions
+
+Aliases: `const_buffer_array<N>` and `mutable_buffer_array<N>`.
+
+### `make_buffer`
+
+Factory function with overloads for every common container type. Returns `mutable_buffer` for mutable inputs, `const_buffer` for const inputs. Each overload has a variant with a `max_size` clamp. Supported types:
+
+- Raw `void*` / `void const*` + size
+- C arrays `T[N]`
+- `std::array<T, N>`
+- `std::vector<T, Alloc>` (requires `is_trivially_copyable<T>`)
+- `std::basic_string<CharT>` / `std::basic_string_view<CharT>`
+- `std::span<T, Extent>` (requires `sizeof(T) == 1`)
+- Any `contiguous_range` with trivially copyable elements (general fallback)
+
+---
+
+## 7. Buffer Sequence Wrappers
+
+### `buffer_param<BS>`
+
+A windowed iterator over large buffer sequences, designed for coroutine I/O loops. Maintains an internal array of up to `max_iovec_` (16) buffer descriptors, auto-refilling from the underlying sequence as windows are consumed.
+
+**Critical design for coroutines**: The outer template function must accept the buffer sequence **by value** (not by reference). When a coroutine suspends, reference parameters may dangle. `buffer_param` takes `BS const&` internally but the caller's template captures the sequence into the coroutine frame by value.
+
+API: `data()` returns the current `span<buffer_type>` window (auto-refills if exhausted), `consume(n)` advances by n bytes, `more()` checks if additional buffers remain.
+
+**Virtual interface pattern**: enables passing arbitrary buffer sequences through a virtual function boundary. The template function drives iteration; the virtual function receives a simple `span<const_buffer>`.
+
+```cpp
+class base
+{
+public:
+    task<> write(ConstBufferSequence auto buffers)
+    {
+        buffer_param bp(buffers);
+        while(true)
+        {
+            auto bufs = bp.data();
+            if(bufs.empty())
+                break;
+            std::size_t n = 0;
+            co_await write_impl(bufs, n);
+            bp.consume(n);
+        }
+    }
+
+protected:
+    virtual task<> write_impl(
+        std::span<const_buffer> buffers,
+        std::size_t& bytes_written) = 0;
+};
+```
+
+### `consuming_buffers<BufferSequence>`
+
+Wraps a buffer sequence and tracks consumption progress. Stores a reference to the original sequence plus iterator position and byte offset within the current buffer. Its `const_iterator::operator*` adjusts the first buffer for consumed bytes. Simpler than `buffer_param` but references the original sequence rather than copying descriptors.
+
+### `const_buffer_param<BS>`
+
+Alias for `buffer_param<BS, true>` - always produces `const_buffer` regardless of input mutability.
+
+---
+
+## 8. Dynamic Buffers
+
+### The `DynamicBuffer` concept
+
+Models a two-phase write protocol:
+
+1. `prepare(n)` - returns a `MutableBufferSequence` of n writable bytes
+2. `commit(n)` - makes the first n prepared bytes readable via `data()`
+3. `data()` - returns a `ConstBufferSequence` of readable bytes
+4. `consume(n)` - discards n bytes from the front of readable data
+5. `size()`, `max_size()`, `capacity()` - bookkeeping queries
+
+Required nested types: `const_buffers_type` and `mutable_buffers_type`.
+
+### Value Types vs Adapter Types
+
+Capy distinguishes two categories:
+
+- **Value types** (e.g., `flat_dynamic_buffer`) - store bookkeeping internally. Passing as rvalue to a coroutine loses state on suspend. Must be passed by lvalue reference.
+- **Adapter types** - wrap external storage (`std::string*`, `std::vector*`). Define `using is_dynamic_buffer_adapter = void`. Safe as rvalues because the external object retains the data.
+
+### `DynamicBufferParam` concept
+
+Enforces safe passing in coroutines: accepts lvalues of any `DynamicBuffer`, but rvalues only for types tagged with `is_dynamic_buffer_adapter`. This prevents silent data loss from passing value-type dynamic buffers by rvalue into coroutines.
+
+### Implementations
+
+| Type | Backing Storage | `const_buffers_type` | `mutable_buffers_type` | Growth | Adapter? |
+|------|----------------|---------------------|----------------------|--------|----------|
+| `flat_dynamic_buffer` | External `void*` + capacity | `const_buffer` | `mutable_buffer` | Fixed capacity | Yes |
+| `circular_dynamic_buffer` | External `void*` + capacity | `const_buffer_pair` | `mutable_buffer_pair` | Fixed capacity | Yes |
+| `basic_string_dynamic_buffer` | `std::string*` | `const_buffer` | `mutable_buffer` | Grows via string | Yes |
+| `basic_vector_dynamic_buffer` | `std::vector*` | `const_buffer` | `mutable_buffer` | Grows via vector | Yes |
+
+**`flat_dynamic_buffer`**: Linear buffer. `prepare`/`data` return single-element sequences (always contiguous). `consume` advances `in_pos_` without moving data. Fixed capacity set at construction.
+
+**`circular_dynamic_buffer`**: Ring buffer. Data can wrap around, so `data()` and `prepare()` may return a `buffer_pair` (two discontiguous regions). Efficient for FIFO patterns - `consume` never moves data, just advances the read pointer modulo capacity.
+
+**`string_dynamic_buffer` / `vector_dynamic_buffer`**: Adapters over `std::string*` / `std::vector<unsigned char>*`. Can grow dynamically. `consume` uses `erase` from the front (O(n) data movement). Factory function `dynamic_buffer(s)` / `dynamic_buffer(v)` creates the adapter.
+
+---
+
+## 9. Asio Interoperability
+
+Provides bidirectional conversion between Capy and Boost.Asio buffer types via `buffers/asio.hpp`:
+
+- **`to_asio(bs)`** - wraps a Capy buffer sequence so its iterators yield `asio::mutable_buffer` or `asio::const_buffer`
+- **`from_asio(bs)`** - wraps an Asio buffer sequence so its iterators yield `capy::mutable_buffer` or `capy::const_buffer`
+
+The internal `buffer_sequence_adaptor<BufferSequence, IsMutable>` class template detects which library the source belongs to (via `is_native_asio_v`) and maps each dereference to the other library's buffer type. Supports random-access iteration when the source provides it.
+
+---
+
+## 10. Higher-Level Buffer Concepts
+
+### `BufferSource`
+
+The "callee owns buffers" read-side concept. A source provides:
+
+- `pull(span<const_buffer>)` - async, fills the span with descriptors pointing to the source's internal storage, returns `(error_code, span<const_buffer>)`
+- `consume(n)` - advances the read position by n bytes
+
+Models a streaming producer. EOF is signaled via `cond::eof` error code.
+
+### `BufferSink`
+
+The "callee owns buffers" write-side concept. A sink provides:
+
+- `prepare(span<mutable_buffer>)` - synchronous, fills the span with writable buffers from the sink's internal storage
+- `commit(n)` - async, finalizes n written bytes
+- `commit_eof(n)` - async, finalizes n bytes and signals end of stream
+
+Together, `BufferSource` and `BufferSink` enable zero-copy transfer: the source exposes its internal memory, the sink exposes its internal memory, and `buffer_copy` bridges them without intermediate allocations.
+
+---
+
+## 11. Constants and Configuration
+
+- `detail::max_iovec_ = 16` - maximum buffer descriptors per `buffer_param` window. Matches common OS limits for scatter/gather (`UIO_MAXIOV` on Linux is 1024, but 16 is a practical batch size to balance setup cost vs I/O throughput).
diff --git a/doc/buffers-passing-rationale.md b/doc/buffers-passing-rationale.md
new file mode 100644
index 000000000..5f172f202
--- /dev/null
+++ b/doc/buffers-passing-rationale.md
@@ -0,0 +1,370 @@
+# Design Rationale: Buffer Sequence Passing Convention
+
+## Context
+
+This document captures the design space and trade-offs around how buffer
+sequences are passed to I/O operations in capy. The central question is
+whether the `ReadStream` and `WriteStream` concepts should mandate that
+implementations copy the buffer sequence, or whether they should accept
+by reference and leave lifetime management to the caller.
+
+A secondary question is whether the distinction between coroutine-based
+and non-coroutine-based implementations (tasks returning `io_task` vs.
+raw awaitables) changes the answer.
+
+The discussion took place between Vinnie Falco and Peter Dimov. The
+consensus is still being formed; this document records the arguments on
+both sides.
+
+## Current State
+
+The capy documentation currently states:
+
+> Buffer sequences should be accepted by value when the member function
+> is a coroutine, to ensure the sequence lives in the coroutine frame
+> across suspension points.
+
+This statement is acknowledged to be backwards. The discussion produced
+the following corrected understanding:
+
+- A function that returns an `IoAwaitable` directly (a raw awaitable,
+  not backed by a coroutine frame) must store the buffer sequence inside
+  the awaitable, because there is no coroutine frame to hold it. Taking
+  by value ensures the sequence lives in the returned object.
+
+- A function that returns an `io_task` (a coroutine) type-erases the
+  buffer sequence into its coroutine frame. In this case the caller's
+  object is referenced across suspension points through the coroutine
+  frame itself, and taking by `const&` is correct - the sequence is not
+  a temporary relative to the suspension.
+
+The guidance should read: raw awaitables take by value; coroutine-based
+tasks take by `const&`.
+
+## Background
+
+### Two Layers of Lifetime
+
+Buffer sequences have a two-layer lifetime structure:
+
+1. **The sequence object** - the iterator pair or container that
+   describes which memory regions to use. This is typically cheap to
+   copy; it holds pointers and sizes, not bytes.
+
+2. **The underlying memory** - the bytes the buffers point at. The
+   sequence does not own this memory. Whoever created the buffers is
+   responsible for keeping the memory alive until the operation
+   completes.
+
+The passing-convention debate concerns layer 1 only. Both sides agree
+that layer 2 is a harder, separate problem, and that coroutines solve it
+elegantly by anchoring the memory in the coroutine frame.
+
+### The Asio Precedent
+
+Asio's specification requires:
+
+> If a read or write operation is also an asynchronous operation, the
+> operation shall maintain one or more copies of the buffer sequence
+> until the operation no longer requires access to the memory specified
+> by the buffers in the sequence.
+
+This is a mandatory copy of the sequence object (layer 1). Asio requires
+`CopyConstructible` as a consequence. The copy keeps the sequence alive
+across the multiple `async_read_some` / `async_write_some` calls that
+compose the full operation.
+
+The question is whether capy should follow this precedent or loosen it
+for the coroutine-first context.
+
+## The Case For Mandatory Copy (By-Value)
+
+### Correctness by Default
+
+Taking the buffer sequence by value when returning a raw awaitable is
+the only way to guarantee correctness regardless of call pattern. The
+concept does not know whether the caller will immediately `co_await` the
+result or store the awaitable and `co_await` it later:
+
+```cpp
+unsigned char buffer[1024];
+auto aw = stream.read_some(mutable_buffer(buffer, sizeof(buffer)));
+// ... if read_some took by const&, aw now holds a dangling pointer ...
+co_await aw;
+```
+
+If `read_some` takes by `const&` or `&&` without storing a copy, the
+caller who defers the `co_await` has undefined behavior. Taking by value
+eliminates this class of bug.
+
+### Detached Awaitables and Senders
+
+Type-erasing stream wrappers capture the awaitable rather than
+`co_await`-ing it inline. The sender bridge in capy captures awaitables
+and runs them as sender operations. Both patterns require the awaitable
+to be self-contained. A by-value sequence in the awaitable makes this
+safe. A reference does not.
+
+### Owning Buffer Sequences
+
+The buffer sequence concept does not preclude owning sequences - types
+that hold a `shared_ptr` to their memory and expose `const_buffer`
+iterators. The Asio documentation and example code (see
+`reference_counted.cpp`) demonstrate this pattern explicitly: the
+reference count is the mechanism by which the memory lifetime is tied to
+the operation lifetime. A guaranteed copy of the sequence is what makes
+this work - when the last copy is destroyed, the reference count drops
+and the memory is released.
+
+Without a guaranteed copy, the owning-sequence pattern requires the
+caller to manage lifetime explicitly, defeating the purpose.
+
+### Arrays Are Not Copyable
+
+C language arrays (`const_buffer buf[N]`) are not copyable. If the
+concept requires `CopyConstructible`, language-level arrays cannot be
+passed directly. This is not a reason to drop the copy requirement - it
+is a reason to use `std::array<const_buffer, N>` instead. The
+distinction is intentional: `std::array` is a first-class range with
+copy semantics; C arrays are not.
+
+## The Case Against Mandatory Copy (By-Reference)
+
+### The Copy Is Not Free for All Sequences
+
+While copying a `mutable_buffer` or `std::array<const_buffer, 4>` is
+cheap, the concept does not bound the number of buffers in the sequence.
+A sequence representing one million scatter-gather regions is legitimate
+and not uncommon in high-throughput networking. Mandating a copy of
+every such sequence at every call site is a performance tax that
+accumulates.
+
+The `buffer_array` type in capy arose as a workaround: it avoids
+initializing capacity on construction precisely because the copy is
+non-trivial at scale. This is a self-inflicted problem if the copy is
+mandatory.
+
+### Wrapping Streams Are Penalized Twice
+
+A stream wrapper that reads from an inner stream and post-processes the
+data (e.g., XOR, compression, TLS) must pass the buffer sequence down to
+the inner stream. If the concept mandates by-value, the wrapper copies
+the sequence on entry, then the inner stream copies it again. Two
+copies, neither necessary:
+
+```cpp
+template<class ReadStream> class xoring_stream
+{
+    ReadStream& s_;
+public:
+    template<class Buffers>
+    io_task<size_t> read_some(Buffers buffers) // first copy here
+    {
+        auto [ec, n] = co_await s_.read_some(buffers); // second copy here
+        xor_buffers(buffers, n);
+        co_return { ec, n };
+    }
+};
+```
+
+This problem is compounded when the concept mandates by-value
+unconditionally, because the wrapper cannot use the more appropriate
+`Buffers const&` signature - the concept forces its hand.
+
+### Coroutines Make the Copy Unnecessary
+
+In a coroutine-based design, the caller's buffer sequence is in the
+caller's coroutine frame. When the caller `co_await`s the operation, the
+caller is suspended and the frame - including the buffer sequence - stays
+alive until the operation completes. No copy is needed. Requiring a copy
+anyway is a cost that buys nothing in the common case.
+
+### Start Without, Add Later If Needed
+
+If capy ships without mandatory copies, implementations that need them
+(for detached awaitables, senders, owning sequences) can make them
+explicitly. The converse is not true: if capy mandates copies, the
+design calcifies around the copy and there is no path to removing it
+later. Starting without the copy preserves optionality.
+
+## Key Tension Points
+
+### By-Value vs. By-`&&`
+
+A forwarding reference (`Buffers&&`) is an alternative to by-value that
+avoids copying lvalue sequences:
+
+```cpp
+template<class Buffers>
+auto read_some(Buffers&& buf)
+{
+    struct aw {
+        stream* s_;
+        Buffers buf_; // deduced as reference type for lvalues
+    };
+    return aw{ this, std::forward<Buffers>(buf) };
+}
+```
+
+The problem: when `Buffers` is deduced as an lvalue reference, `buf_`
+is a reference member. The awaitable then holds a reference to the
+caller's object, which may go out of scope before `co_await`. The
+by-`&&` approach is only safe for rvalues; for lvalues it silently
+introduces the same dangling-reference hazard that by-value avoids.
+
+Correcting this requires `std::decay_t<Buffers>` for the stored type,
+which makes by-`&&` equivalent to by-value for the storage decision.
+
+### Tasks vs. Raw Awaitables
+
+The passing convention differs by return type:
+
+- `io_task<size_t> read_some(Buffers const& buf)` - the coroutine frame
+  is the storage. The caller's object is kept alive by the `co_await`
+  chain. By-`const&` is correct and no copy is made.
+
+- `IoAwaitable auto read_some(Buffers buf)` - the returned object is the
+  storage. The awaitable must be self-contained. By-value is required.
+
+The concept as written applies to both cases with a single signature,
+which creates the tension. A concept that admits both signatures (and
+distinguishes them by return type) would resolve the ambiguity, at the
+cost of a more complex concept.
+
+### The Array Problem
+
+The `io_task` path accepts `const_buffer buf[N]` today because it takes
+by `const&` and does not make a copy. If the concept is tightened to
+require a copy, raw arrays are excluded. The fix is `std::array`, but
+this is a source-compatibility break for any code that passes C arrays
+today.
+
+## Areas of Agreement
+
+1. **The underlying memory lifetime is a separate, harder problem.**
+   Coroutines solve this by keeping the memory in the coroutine frame.
+   Buffer sequence passing convention does not affect this.
+
+2. **A `mutable_buffer` or `const_buffer` by itself is cheap to copy.**
+   The dispute is about sequences of buffers at scale.
+
+3. **Language-level arrays (`T[]`) are second-class.** They are not
+   copyable. Code that needs to pass buffer sequences should use
+   `std::array` or a container. The concept should not be weakened to
+   accommodate `T[]`.
+
+4. **Raw awaitables must store a copy.** A function returning an
+   `IoAwaitable` directly, without a backing coroutine frame, must
+   store the buffer sequence in the returned object. By-value is the
+   only safe signature.
+
+5. **Coroutine-based tasks can accept by `const&`.** When the function
+   is a coroutine returning `io_task`, the sequence lives in the
+   caller's frame across the `co_await`. No copy is needed by the
+   callee.
+
+## Areas of Disagreement
+
+1. **Whether the concept should mandate the copy.** One view holds that
+   mandating a copy is correct by default and the cost is acceptable
+   because sequences are supposed to be cheap. The other holds that the
+   concept should not constrain implementations unnecessarily and callers
+   who need a copy should make one explicitly.
+
+2. **Whether detached awaitables and senders are a primary concern.**
+   One view holds that these patterns are real and the concept must
+   accommodate them safely. The other holds that they are niche cases
+   and should not drive the default API design.
+
+3. **Whether owning sequences justify the copy.** One view holds that
+   the owning-sequence pattern (using `shared_ptr` to tie memory
+   lifetime to operation lifetime) is a legitimate and useful pattern
+   that requires a guaranteed copy. The other holds that coroutines
+   eliminate the need for this pattern and it should not drive the
+   concept design.
+
+4. **Whether to start permissive or restrictive.** One view holds that
+   starting without the copy and adding it later is the right
+   engineering approach - remove requirements you don't need. The other
+   holds that correctness by default is worth the cost, and relaxing
+   later is harder than tightening.
+
+## Summary
+
+| Property                          | By-Value (Copy)      | By-Reference (No Copy) |
+| --------------------------------- | -------------------- | ---------------------- |
+| Raw awaitable safety              | Guaranteed           | Requires discipline    |
+| Detached awaitable / sender safety | Guaranteed          | Requires discipline    |
+| Owning sequences                  | Supported            | Not supported          |
+| Coroutine overhead                | Unnecessary copy     | None                   |
+| Wrapping stream overhead          | Two copies           | None                   |
+| Large scatter-gather sequences    | Costly               | Free                   |
+| C array compatibility             | Excluded             | Works                  |
+| Start permissive, tighten later   | No                   | Yes                    |
+
+The core tension is between safety by default (by-value) and
+implementation freedom (by-reference). The by-value convention
+eliminates a class of lifetime bugs for detached awaitables and owning
+sequences, at the cost of unnecessary copies in the coroutine case and
+a penalty for large sequences and wrapping streams. The by-reference
+convention is appropriate for coroutine-based tasks but unsafe for raw
+awaitables without caller discipline.
+
+A complete resolution likely requires distinguishing the two cases in
+the concept itself: raw awaitables mandate by-value; coroutine tasks
+accept by `const&`. Whether the standard should mandate that
+implementations keep at least one copy alive for the duration of the
+operation - regardless of how the parameter is passed - remains an open
+question.
+
+## Resolution
+
+Tracked in [cppalliance/capy#263](https://github.com/cppalliance/capy/issues/263).
+
+**All capy I/O entry points that accept a buffer sequence take it by
+value.** This applies uniformly to member operations (`read_some`,
+`write_some`) and to the free-function composed operations (`read`,
+`write`).
+
+The "distinguish the two cases" idea floated in the section above did
+not survive scrutiny. The argument that coroutine tasks can safely
+accept by `const&` because "the sequence is not a temporary relative
+to the suspension" assumed that the coroutine body has run by the
+point of suspension. With lazy coroutines, it has not:
+
+```cpp
+auto aw = capy::read(stream, mutable_buffer{p, n});
+// The temporary mutable_buffer dies here, at the end of the
+// full-expression. The coroutine body has not begun executing.
+auto [ec, k] = co_await std::move(aw);
+// If read() took the sequence by const&, this co_await
+// dereferences a dangling reference.
+```
+
+By-rvalue-reference (`Buffers&&`) fails for the same reason: the
+coroutine has no opportunity to copy the rvalue into its frame before
+the full-expression ends and the rvalue is destroyed.
+
+**By-value is therefore the only safe convention for any lazy
+awaitable** - coroutine-backed or not. The same rule applies to raw
+awaitables for the reasons already given (the awaitable must own its
+state to support sender pipelines and detached storage).
+
+### Caller-side workaround for expensive sequences
+
+The Asio-style assumption that buffer sequences are cheap to copy
+still leaks through in cases like `std::vector<mutable_buffer>` with
+many entries. Callers in that situation can opt into a reference-like
+view at the call site:
+
+```cpp
+std::vector<mutable_buffer> bufs = /* many entries */;
+auto [ec, n] = co_await capy::read(stream, std::views::all(bufs));
+```
+
+`std::views::all(bufs)` produces a `std::ranges::ref_view` that
+satisfies the buffer-sequence concepts and copies in O(1). The caller
+takes on the lifetime obligation in exchange for the cheap copy -
+the same trade-off any reference-passing convention would impose,
+but now opt-in and visible at the call site rather than baked into
+the API.
diff --git a/doc/buffers-peter.md b/doc/buffers-peter.md
new file mode 100644
index 000000000..0f333f9b2
--- /dev/null
+++ b/doc/buffers-peter.md
@@ -0,0 +1,105 @@
+Peter's position on buffer sequence design in Capy:
+
+---
+
+We need to clarify our approach for manipulating slices (byte ranges) of buffer sequences. We can use the implementation of read as a canonical example of this need.
+
+Our current implementation is
+
+    consuming_buffers consuming(buffers);
+    std::size_t const total_size = buffer_size(buffers);
+    std::size_t total_read = 0;
+
+    while(total_read < total_size)
+    {
+        auto [ec, n] = co_await stream.read_some(consuming);
+        consuming.consume(n);
+        total_read += n;
+        if(ec)
+            co_return {ec, total_read};
+    }
+
+    co_return {{}, total_read};
+
+which is actually a legitimate approach to things - have a stateful slice type (not necessarily called consuming_buffers), construct it over the passed buffer sequence, then iterate by alternately passing it to read_some and removing a prefix.
+
+One change we can make to the above (besides the name of the slice class) would be to not require it to be a buffer sequence itself. Instead of co_await read_some(consuming), we can have co_await read_some(consuming.data()), with the buffer sequence only produced on demand.
+
+In earlier discussions, however, I was told that the above is a temporary implementation subject to be replaced with the real one, the real one being something along the lines of
+
+    std::size_t const total_size = buffer_size(buffers);
+    std::size_t total_read = 0;
+
+    auto seq = sans_prefix(buffers, 0);
+
+    while(total_read < total_size)
+    {
+        auto [ec, n] = co_await stream.read_some(seq);
+        seq = remove_prefix(seq, n);
+        total_read += n;
+        if(ec)
+            co_return {ec, total_read};
+    }
+
+    co_return {{}, total_read};
+
+The implication here is that sans_prefix(buffers, 0) would be required to return a buffer sequence seq such that the result of calling remove_prefix on it would be assignable to seq.
+
+While that's possible to implement, I don't like it one bit; the specifications of sans_prefix and remove_prefix become entangled with special cases, and having the "begin" operation be spelled sans_prefix(buffers, 0) is kind of stupid.
+
+Essentially, this implements our current approach, but spells the operations and the iteration state type in a weird manner. I would prefer the approach of naming the iteration state type and its operations explicitly.
+
+There's one alternative we haven't considered, though, and towards which I have gravitated. We can also eliminate the need for a slice-producing type by pushing the responsibility of handling slices onto the implementers of read_some and write_some.
+
+That is, we can add offset and length parameters to read_some, with the result being
+
+    std::size_t const total_size = buffer_size(buffers);
+
+    std::size_t offset = 0, length = total_size;
+
+    while(offset < total_size)
+    {
+        auto [ec, n] = co_await stream.read_some(buffers, offset, length);
+        offset += n;
+        length -= n;
+        if(ec)
+            co_return {ec, offset};
+    }
+
+    co_return {{}, offset};
+
+This doesn't increase the total implementation complexity compared to the user passing a slice type to read_some, because we can still provide the slice type, and the implementer of read_some can still implement the offset+length function in terms of the existing offsetless/lengthless function by passing it a slice. However, in many cases the implementation of read_some can take advantage of offset and length natively, without much additional complexity.
+
+---
+
+At the moment, read_some is specified as
+
+template<MutableBufferSequence Buffers>
+IoAwaitable auto read_some(Buffers buffers);
+That is, it takes buffers by value.
+
+This doesn't seem correct. It would imply that std::vector<mutable_buffer> is required to be copied by read_some, which is unnecessary.
+
+And if we change this to e.g. Buffers&& buffers, we need to clarify that read_some doesn't mutate buffers in the lvalue case, so that the caller can still use buffers in a subsequent read_some (or write_some) call.
+
+The rvalue case is trickier, but we need to have clarity on #261 before deciding.
+
+---
+
+We should remove all tag_invoke customization points pertaining to buffer sequences, and all custom buffer sequence types that only exist because they customize an operation. Buffer sequences should be generic ranges of const_bufffer or mutable_buffer (or more precisely, any generic range should be accepted as a buffer sequence; const_buffer and mutable_buffer by itself would still be able to serve as buffer sequences.)
+
+---
+
+It will be convenient to have a read_at_least algorithm that is a straightforward extension of read. While read reads exactly buffer_size(buffers) bytes, read_at_least would take the minimum amount of bytes as a parameter, and the only change would be in the loop condition. Instead of while(bytes_read < buffer_size(buffers)), it would be while(bytes_read < bytes_requested).
+
+One motivating example can be found here:
+
+https://github.com/pdimov/corosio_protocol_bench/blob/ea373f3f9e3c1945627c85f24fb9c256128bb11a/buffered_socket_source.hpp#L62
+
+The "buffered source" implementation needs to read the n bytes requested by the user, and to fill its buffer, with a single invocation. While n is a required amount and must be met or exceeded, the subsequent N bytes filling the buffer are optional and there's no need to block or loop for them.
+
+I don't have a motivating example for write_at_least, but we should provide it for consistency and symmetry.
+
+The one subtlety here is that it's possible for the user to pass parameters that are impossible to satisfy (if the requested minimum amount of bytes exceeds buffer_size(buffers). In this case, I believe that the function should fail immediately, with {EINVAL, 0}.
+
+---
diff --git a/doc/combinator-gap-analysis.md b/doc/combinator-gap-analysis.md
new file mode 100644
index 000000000..4a3a0910f
--- /dev/null
+++ b/doc/combinator-gap-analysis.md
@@ -0,0 +1,160 @@
+# Combinator Gap Analysis: Coroutine-Native vs. Sender/Receiver
+
+March 2026.
+
+---
+
+## Scope
+
+This document compares the combinator capabilities of two models:
+
+- **Coroutine-native** (`io::when_all`, `io::when_any`) as specified in `combinators.md`
+- **Sender/receiver** (`std::execution::when_all`) as specified in P2300R10
+
+The comparison assumes the sender model routes I/O results correctly: `!ec` goes to `set_value(T...)`, `ec` goes to `set_error(ec)`. This routing happens in the coroutine body - the translation layer described in P4093R0 (Falco, Gerbino). The coroutine absorbs the compound `io_result{ec, n}`, uses `n` on the success path, and propagates `ec` on the failure path. By the time the result crosses the sender boundary, it is clean: value or error, never both.
+
+---
+
+## 1. The Routing Decision
+
+The three-channel model works for I/O when the compound result is decomposed before crossing the sender boundary:
+
+```cpp
+// Translation layer (P4093)
+capy::task<std::error_code>
+read_all(auto& stream, auto buf)
+{
+    auto [ec, n] = co_await capy::read(
+        stream, buf);
+    if (ec)
+        co_return ec;
+    // use n...
+    co_return {};
+}
+```
+
+The coroutine body is the routing decision. `!ec` produces a value. `ec` produces an error. The sender boundary sees one or the other, never both. The three channels work as designed.
+
+---
+
+## 2. when_all Comparison
+
+Assuming correct routing (`!ec` -> `set_value`, `ec` -> `set_error`):
+
+| # | Scenario | Coroutine-native | Sender | Gap? |
+|---|----------|------------------|--------|------|
+| 1 | All tasks succeed | `io_result<R1,...,Rn>` with `!ec` | `tuple<R1,...,Rn>` via `set_value` | Ergonomic only |
+| 2 | One task returns `ec` | Inspects `ec`. Cancels siblings. | `set_error(tuple(ec, T...))`. Cancels siblings. | None |
+| 3 | Multiple `ec` concurrently | First `ec` wins. | First `set_error` wins. | None |
+| 4 | EOF, `n == 0` | Error. Cancel. | `set_error(tuple(eof, 0))`. Cancel. | None |
+| 5 | Partial transfer (`ec`, `n > 0`) | Error. Cancel. Partial bytes stored. | `set_error(tuple(ec, n))`. Cancel. `n` preserved in error payload. | None |
+| 6 | Zero-length buffer, success | Success. | `set_value`. | None |
+| 7 | Zero-length buffer, `ec` | Error. Cancel. | `set_error(tuple(ec, 0))`. Cancel. | None |
+| 8 | One task throws | Exception. Cancel. Rethrow. | `set_error(exception_ptr)`. Cancel. Rethrow. | None |
+| 9 | Multiple throws | First exception wins. | First `set_error` wins. | None |
+| 10 | Exception vs. `ec` | Exception wins. | Both are `set_error`. Distinguishable by type. | Minor (type dispatch needed) |
+| 11 | Parent stop | ECANCELED, standard behavior. | `set_stopped`. Standard behavior. | None |
+| 12 | All tasks fail | First `ec` wins. | First `set_error` wins. | None |
+| 13 | Failure reporting | Outer `io_result`'s `ec`. | `set_error` channel. | Ergonomic only |
+| 14 | Return type | `io_result<R1,...,Rn>`. One `ec`, flat destructuring. | `tuple<R1,...,Rn>`. No `ec` lifting. | Ergonomic only |
+
+**No structural gaps.** The sender `when_all` can do everything `io::when_all` can do when the routing is correct. With `set_error(tuple(ec, T...))`, partial bytes are preserved in the error payload. The differences are ergonomic: the coroutine-native return type (`io_result<R1,...,Rn>`) is more convenient than `tuple<R1,...,Rn>`. Neither difference affects capability.
+
+---
+
+## 3. when_any Comparison
+
+P2300 does not define `when_any`. A custom sender `when_any` can be written. Assuming correct routing:
+
+| # | Scenario | Coroutine-native | Sender | Gap? |
+|---|----------|------------------|--------|------|
+| 1 | First task succeeds | Winner by `!ec`. Cancel siblings. | Winner by `set_value`. Cancel siblings. | None |
+| 2 | One task returns `ec`, others pending | Does not win. Keep waiting. | `set_error` fires. Does not win. Keep waiting. | None |
+| 3 | One succeeds, one failed | Successful task wins. | `set_value` wins over `set_error`. | None |
+| 4 | All tasks fail | `error_code` at variant index 0. Unspecified which. | Last `set_error` propagated (or unspecified). | None |
+| 5 | One throws, others pending | Does not win. Keep waiting. | `set_error(exception_ptr)` does not win. Keep waiting. | None |
+| 6 | All throw | Rethrow first. | Rethrow first. | None |
+| 7 | Parent stop | ECANCELED at variant index 0. | `set_stopped`. | None |
+| 8 | EOF, `n == 0` | Does not win. | `set_error(eof)`. Does not win. | None |
+| 9 | Immediate completion | Wins normally. | Wins normally. | None |
+| 10 | Return type | `variant<error_code, R1,...,Rn>`. Index 0 is failure. | Implementation-defined. | Ergonomic only |
+
+**No structural gaps.** A sender `when_any` that accepts `set_value` completions as winners and ignores `set_error` completions does exactly what `io::when_any` does. The channel model provides the hook: `set_value` means success, `set_error` means failure.
+
+---
+
+## 4. Where the Routing Happens
+
+The key insight is that the compound result never needs to cross the sender boundary intact. The coroutine body decomposes it:
+
+```
+I/O primitive -> io_result{ec, n}
+                      |
+              coroutine body (translation layer)
+                      |
+            +--------------------+
+            |                    |
+       !ec: set_value(T...)   ec: set_error(ec)
+```
+
+This is P4093's contribution. The coroutine body is not overhead - it is the mechanism that makes the three-channel model work for I/O. Without it, the compound result must be routed whole, and the trilemma from P2430R0 applies. With it, the result is decomposed at the source, and the channels work as designed.
+
+Both models require this coroutine body. In the coroutine-native model, the body feeds `io::when_all` directly. In the sender model, the body feeds `set_value`/`set_error`. The decomposition is identical. The downstream combinator sees the same classification.
+
+---
+
+## 5. What Differs
+
+The gaps are ergonomic, not structural:
+
+### Return type convenience
+
+Coroutine-native `io::when_all` returns `io_result<R1, R2, ..., Rn>` - one `ec`, flat destructuring:
+
+```cpp
+auto [ec, n1, n2, n3] = co_await io::when_all(
+    read(s1), read(s2), read(s3));
+```
+
+Sender `when_all` returns `tuple<R1, R2, R3>` where each `Ri` is `tuple<Ti...>` (the value pack from that child):
+
+```cpp
+auto [r1, r2, r3] = co_await when_all(
+    read(s1), read(s2), read(s3));
+// Each ri is tuple<Ti...> from that child's set_value
+```
+
+### Partial bytes on failure
+
+With `set_error(tuple(ec, T...))` routing, partial bytes are preserved in the error payload. Both models retain the same information on the failure path.
+
+### when_any return type
+
+Coroutine-native returns `variant<error_code, R1, ..., Rn>` with error at index 0. A sender `when_any` would need to define its own return type convention.
+
+### No standard when_any
+
+P2300 does not define `when_any`. One must be written. The coroutine-native model ships it.
+
+---
+
+## 6. What the Coroutine Model Cannot Do
+
+| Capability | Sender model | Coroutine model |
+|------------|--------------|-----------------|
+| Heterogeneous execution contexts per child | Yes (each sender carries its own scheduler) | Limited (children share the parent's executor) |
+| Compile-time work graph composition | Yes (sender chains are types) | No (coroutines are runtime) |
+| GPU dispatch | Yes (schedulers abstract hardware) | No (coroutines require a CPU stack) |
+| Lazy evaluation without coroutine frame | Yes (senders are lazy values) | No (coroutine frame allocated at call) |
+
+These are the sender model's strengths. They are orthogonal to I/O combinators.
+
+---
+
+## 7. Conclusion
+
+There are no structural gaps between the two models for I/O combinators. The three-channel model works for I/O when the compound result is decomposed in the coroutine body before crossing the sender boundary. This decomposition is the translation layer described in P4093R0.
+
+The differences are ergonomic. The coroutine-native model provides a more convenient return type (`io_result<R1,...,Rn>` with one `ec` and flat destructuring), preserves partial bytes on the failure path, and ships `when_any` out of the box. The sender model requires the user to write `when_any` and returns less convenient types, but can express the same semantics.
+
+Both models require a coroutine body to decompose the compound result. The coroutine body is the bridge between I/O's compound results and any combinator model - whether channel-based or value-based. The question is not which model handles compound results better at the combinator level. Both handle them identically, because both rely on the same coroutine body to decompose them first.
diff --git a/doc/combinator-spec.md b/doc/combinator-spec.md
new file mode 100644
index 000000000..ebc184f57
--- /dev/null
+++ b/doc/combinator-spec.md
@@ -0,0 +1,202 @@
+# io::when_all / io::when_any - Behavior Specification
+
+Team working document. March 2026.
+
+---
+
+## Design Basis
+
+`io::when_all` and `io::when_any` are I/O-aware combinators. They know the result convention: `tuple<error_code, T...>`. Success means `!ec`. Failure means `ec`.
+
+The primitive resolves ambiguity at the source: when `bytes_transferred == bytes_requested`, the primitive returns `({}, n)`. By the time the combinator sees the result, `!ec` is reliable. The combinator only inspects the error_code. It does not interpret `T...`.
+
+The synchronous analogy governs the model:
+
+```cpp
+std::tuple result{ f1(), f2(), ..., fN() };
+```
+
+If any of `f1`...`fN` fails, you get the error - not a tuple of mixed results. The tuple only exists on the success path.
+
+---
+
+## io::when_all Behavior Table
+
+| #  | Scenario                                  | Behavior                                                                 | Status      |
+| -- | ----------------------------------------- | ------------------------------------------------------------------------ | ----------- |
+| 1  | All tasks return `!ec`                    | Return tuple of all results. No cancellation.                            | Settled     |
+| 2  | One task returns `ec`, others pending     | Cancel siblings. Propagate error.                                        | Settled     |
+| 3  | Multiple tasks return `ec` concurrently   | Each triggers stop (idempotent). First ec wins.                          | Settled     |
+| 4  | `ec == eof`, `n == 0`                     | Error. Cancel siblings.                                                  | Settled     |
+| 5  | `ec != 0`, `n > 0` (partial transfer)    | Error. Cancel siblings. Values stored as-is (discarding would be the special case given the return type). Caller sees `ec` and knows it failed; partial values are available but not guaranteed meaningful. | Settled     |
+| 6  | Zero-length buffer, `({}, 0)`             | Success. No cancellation.                                                | Settled     |
+| 7  | Zero-length buffer, `(ec, 0)`             | Error (ec reflects stream state). Cancel siblings.                       | Settled     |
+| 8  | One task throws                           | Capture exception. Cancel siblings. Rethrow after all complete.          | Settled     |
+| 9  | Multiple tasks throw                      | First exception captured. Others discarded. Rethrow first.               | Settled     |
+| 10 | One throws, another returns `ec` (either order) | Exception always wins. No value to return through an exception. Tuple is not accessible. | Settled     |
+| 11 | Parent stop token fires                   | Not a special case. Children return ECANCELED, which is an error like any other. First ec wins, standard error behavior. | Settled     |
+| 12 | All tasks fail                            | Propagate single error_code (first wins). Not a tuple of failures.       | Settled     |
+| 13 | How does failure reach the caller?        | Via the outer `io_result`'s `ec`. See Return Type below.                 | Settled     |
+| 14 | Return type                               | `io_result<R1, R2, ..., Rn>` where each `Ri` is `Ti` if child has single `T`, or `tuple<Ti...>` if child has multiple `T`s. See Return Type below. | Settled     |
+
+---
+
+## io::when_all Return Type
+
+Peter Dimov's proposal: the return type lifts the error_code out of each child's `io_result` into a single outer `io_result`. Child value types are collected as parameters.
+
+Given children returning `io_result<T1...>`, `io_result<T2...>`, ..., `io_result<Tn...>`:
+
+```
+io::when_all(child1, child2, ..., childN)
+  -> io_result<R1, R2, ..., Rn>
+```
+
+Where each `Ri` is:
+- `Ti` directly, if the child returns `io_result<Ti>` (single type - no wrapping)
+- `tuple<Ti...>` if the child returns `io_result<Ti1, Ti2, ...>` (multiple types)
+- `tuple<>` if the child returns `io_result<>` (no value types)
+
+### Examples
+
+Three reads, each returning `io_result<size_t>`:
+
+```cpp
+auto [ec, n1, n2, n3] = co_await io::when_all(
+    stream.read_some(buf1),
+    stream.read_some(buf2),
+    stream.read_some(buf3));
+// ec: single error_code (first failure wins)
+// n1, n2, n3: size_t values (only meaningful when !ec)
+```
+
+Mixed result types:
+
+```cpp
+// task_a returns io_result<size_t>
+// task_b returns io_result<size_t, flags>
+// task_c returns io_result<message>
+auto [ec, n, tf, msg] = co_await io::when_all(
+    task_a, task_b, task_c);
+// ec: single error_code
+// n: size_t from task_a
+// tf: tuple<size_t, flags> from task_b
+// msg: message from task_c
+```
+
+Void results (`io_result<>`):
+
+```cpp
+// task_a returns io_result<size_t>
+// task_b returns io_result<>  (just error_code, no values)
+// task_c returns io_result<size_t>
+auto [ec, n1, empty, n2] = co_await io::when_all(
+    task_a, task_b, task_c);
+// ec: single error_code
+// n1: size_t from task_a
+// empty: tuple<> from task_b
+// n2: size_t from task_c
+```
+
+`io_result<>` contributes a `tuple<>` to the outer parameter list. Same rule as multiple types (`tuple<T...>`), applied to zero types.
+
+### Why this works
+
+- **Binary outcome.** Caller checks one `ec`. On success, destructures the values. On failure, handles one error. Matches the synchronous analogy.
+- **No redundant error codes.** Child error codes are stripped. On the success path they were all zero. On the failure path only the first matters.
+- **Natural destructuring.** Single-type children flatten into the parameter list. `auto [ec, n1, n2, n3]` just works.
+- **Consistent with io_result convention.** The result is itself an `io_result`, so it composes with anything that already handles `io_result`.
+
+---
+
+## io::when_any Behavior Table
+
+`io::when_any` selects a winner. A task wins by succeeding: `!ec`. Tasks that fail or throw do not win.
+
+| #  | Scenario                                  | Behavior                                                                 | Status      |
+| -- | ----------------------------------------- | ------------------------------------------------------------------------ | ----------- |
+| 1  | First task to return `!ec`                | Wins. Cancel siblings. Return winner's result.                           | Settled     |
+| 2  | One task returns `ec`, others pending     | Does not win. Keep waiting.                                              | Settled     |
+| 3  | One succeeds, one already failed          | Successful task wins.                                                    | Settled     |
+| 4  | All tasks return `ec` (all fail)          | No winner. Variant holds `error_code` at index 0. Which child's ec is unspecified (likely last, as that is the natural implementation result). | Settled     |
+| 5  | One task throws, others pending           | Exception does not win. Keep waiting for a success.                      | Settled     |
+| 6  | All tasks throw                           | No success possible. Rethrow first exception.                            | Settled     |
+| 7  | Parent stop fires before any completion   | All children cancelled. Variant holds `error_code` at index 0 (ECANCELED). | Settled     |
+| 8  | `ec == eof`, `n == 0`                     | Error. Does not win.                                                     | Settled     |
+| 9  | Immediate completion (await_ready true)   | Wins normally. No special treatment.                                     | Settled     |
+| 10 | Return type                               | `variant<error_code, R1, R2, ..., Rn>`. Index 0 is error_code (failure/no winner). Index 1..N are success results. See Return Type below. | Settled     |
+
+---
+
+## io::when_any Return Type
+
+Option B (Peter Dimov), with `error_code` first:
+
+```
+io::when_any(child1, child2, ..., childN)
+  -> variant<error_code, R1, R2, ..., Rn>
+```
+
+Where each `Ri` follows the same rules as `when_all`:
+- `Ti` directly, if the child returns `io_result<Ti>` (single type)
+- `tuple<Ti...>` if the child returns `io_result<Ti1, Ti2, ...>` (multiple types)
+- `tuple<>` if the child returns `io_result<>` (no value types)
+
+Index 0 (`error_code`) is the failure/no-winner case. Index 1..N identifies which child won.
+
+### Examples
+
+Three reads, each returning `io_result<size_t>`:
+
+```cpp
+auto result = co_await io::when_any(
+    stream.read_some(buf1),
+    stream.read_some(buf2),
+    stream.read_some(buf3));
+// result: variant<error_code, size_t, size_t, size_t>
+
+if (result.index() == 0)
+    // all failed: std::get<0>(result) is the error_code
+else
+    // result.index() - 1 is the winning child
+```
+
+Mixed result types:
+
+```cpp
+// task_a returns io_result<size_t>
+// task_b returns io_result<message>
+auto result = co_await io::when_any(task_a, task_b);
+// result: variant<error_code, size_t, message>
+
+if (result.index() == 1)
+    // task_a won: std::get<1>(result) is size_t
+else if (result.index() == 2)
+    // task_b won: std::get<2>(result) is message
+else
+    // failure: std::get<0>(result) is error_code
+```
+
+### Why this works
+
+- **Winner identification.** `result.index() - 1` maps directly to the child index. No separate index field needed.
+- **Error at index 0.** Failure is always `index() == 0`. Simple boolean check: `if (result.index() == 0)`.
+- **No redundant error codes.** Winners have `!ec` by definition. The child's error_code is stripped. Only the failure case carries an error_code.
+- **Consistent with when_all.** Same `Ri` flattening rules. Same error_code stripping. Different container (variant vs io_result).
+
+---
+
+## Status
+
+All rows settled. Both tables complete. No open questions remain.
+
+---
+
+## Contributors
+
+- Peter Dimov: primitive normalization rule, binary success/failure model, "first ec wins" semantics, outer io_result return type proposal, corrections to rows 5/10/12
+- Andrzej Krzemienski: error_code is status not error, three-bucket classification (success/failure/cancellation)
+- Michael Vandeberg: predicate-based combinator design (superseded by concrete io:: approach), when_any winner selection, issues 204-206
+- Vinnie Falco: ssl_stream_truncated counterexample, partial transfer preservation question
+- Richard: concrete use cases (NNTP, HTTP HEAD mirroring)
+- Mungo Gill: translation layer / D4056 error handling
diff --git a/doc/continuation-rationale.md b/doc/continuation-rationale.md
new file mode 100644
index 000000000..c69d3c97a
--- /dev/null
+++ b/doc/continuation-rationale.md
@@ -0,0 +1,529 @@
+# Design Rationale: Continuation Type in the Executor Interface
+
+## Context
+
+This document captures the design space and trade-offs around replacing
+`std::coroutine_handle<>` with a first-class `continuation` type in
+capy's executor interface. The central question is whether the executor
+concept should traffic in raw coroutine handles or in a richer type that
+carries intrusive queue metadata. Secondary questions address where the
+`continuation` object lives, how it is passed, and what this means for
+the promise base, the `IoAwaitable` protocol, and downstream consumers
+like corosio.
+
+The consensus was reached through discussion and prototyping. The
+implementation ships as a breaking change to the `Executor` concept.
+
+## Current Consensus
+
+The executor concept adopts `continuation&` as the parameter type for
+`dispatch` and `post`:
+
+```cpp
+struct continuation
+{
+    std::coroutine_handle<> h;
+    continuation* next = nullptr;
+};
+
+concept Executor = requires(E& e, continuation c) {
+    { e.dispatch(c) } -> std::same_as<std::coroutine_handle<>>;
+    { e.post(c) };
+    // ...
+};
+```
+
+Both fields are public. The `continuation` lives in the I/O awaitable
+for caller-handle posting, and in combinator/trampoline state for
+parent-dispatch and child-launch patterns. The `IoAwaitable` concept
+is unchanged. The promise base (`io_awaitable_promise_base`) is
+unchanged. The `dispatch` return type remains `std::coroutine_handle<>`
+for symmetric transfer.
+
+The rationale for these choices follows.
+
+## Background
+
+### The Executor Bottleneck
+
+Every coroutine resumption in capy funnels through the executor's
+`dispatch` or `post`. I/O completions, combinator child launches,
+cancel callbacks, and cross-executor trampolines all converge on
+these two operations. The executor interface is the narrowest
+bottleneck in the library.
+
+### The Allocation Problem
+
+With `std::coroutine_handle<>` as the parameter, executors that queue
+work must allocate a node to hold the handle. The thread pool wraps
+every posted handle in a heap-allocated `work` struct:
+
+```cpp
+struct work : intrusive_queue<work>::node
+{
+    std::coroutine_handle<> h_;
+    // ...
+};
+
+void post(std::coroutine_handle<> h) {
+    auto* w = new work(h);   // per-post allocation
+    q_.push(w);
+}
+```
+
+Corosio's reactor scheduler has the same pattern: a `post_handler`
+that inherits from `scheduler_op` and is heap-allocated for every
+`post(coroutine_handle<>)` call. Corosio solved this for I/O
+operations by using `scheduler_op*` (an intrusive node embedded in
+the awaitable), but the executor-level `post(coroutine_handle<>)`
+path remained allocating.
+
+Frame allocation is already recycled via `recycling_memory_resource`.
+Queue-node allocation is the last steady-state allocation in the hot
+path.
+
+### The Safety Problem
+
+Users can obtain a `std::coroutine_handle<>` and call
+`executor.post(h)` directly. Misuse of raw coroutine handles
+(double resume, use-after-destroy, resuming on the wrong thread)
+causes silent UB. The type system does nothing to prevent it.
+
+### Concepts vs. Concrete Types
+
+A concept specifies the least set of requirements that generic code
+may rely on. The executor concept determines what `dispatch` and
+`post` accept. Changing this parameter type is a breaking change to
+the concept and all conforming executor implementations.
+
+## The Parameter Type Question
+
+Three options exist for how `dispatch` and `post` receive the
+continuation:
+
+### Option P1: By Value
+
+```cpp
+void post(continuation c) const;
+```
+
+**Arguments for:**
+
+1. Simplest signature. Matches the original sketch.
+2. No aliasing concerns — the executor gets its own copy.
+
+**Arguments against:**
+
+1. Breaks zero-allocation queuing. The executor links the
+   continuation into an intrusive queue via `next`. If `c` is a
+   stack-local copy, the copy is destroyed when `post` returns and
+   the queue has a dangling pointer. The whole point of the intrusive
+   `next` is that the executor queues the *original object*, not a
+   copy.
+2. For `dispatch`, the inline case (return `c.h` for symmetric
+   transfer) works, but the fallback to `post` has the same problem.
+
+### Option P2: By Reference (chosen)
+
+```cpp
+void post(continuation& c) const;
+std::coroutine_handle<> dispatch(continuation& c) const;
+```
+
+**Arguments for:**
+
+1. The executor links the original object into the queue. No copy,
+   no dangling pointer.
+2. The caller guarantees address stability — the `continuation`
+   must outlive the queue residency. This is the same guarantee
+   already required for coroutine frames and awaitable objects.
+3. Cleaner than pointer — no null state to handle.
+
+**Arguments against:**
+
+1. Requires the caller to ensure the `continuation` is an lvalue
+   with sufficient lifetime. A `continuation` constructed as a
+   temporary cannot be passed.
+
+### Option P3: By Pointer
+
+```cpp
+void post(continuation* c) const;
+```
+
+**Arguments for:**
+
+1. Traditional for intrusive data structures. Nullable.
+
+**Arguments against:**
+
+1. Nullable without reason — a null continuation is meaningless for
+   `post` and `dispatch`.
+2. Pointer syntax at every call site (`&c` vs. `c`).
+
+**Recommendation:** Option P2. By-reference is the only option that
+supports zero-allocation intrusive queuing without introducing null
+states. The address-stability requirement is inherent to intrusive
+data structures and is already a property of the objects that embed
+continuations (awaitables, combinator state).
+
+## The Placement Question
+
+The `continuation` needs a stable address while it sits in an
+executor's queue. Two locations were considered:
+
+### Option L1: In the Promise
+
+`io_awaitable_promise_base` gains a `continuation` member. One
+`continuation` per coroutine, reused across all suspension points.
+
+**Arguments for:**
+
+1. One canonical location per coroutine. No question about where
+   it lives.
+2. The promise outlives every suspension point, so the address is
+   always stable.
+3. `final_suspend` can dispatch the parent's continuation directly
+   without any additional state.
+
+**Arguments against:**
+
+1. Changes the `IoAwaitable` concept. `await_suspend` must receive
+   `continuation&` instead of `coroutine_handle<>`, or the awaitable
+   must reach into the caller's promise to get the continuation. Both
+   are protocol changes.
+2. Burdens task authors. Every promise type that inherits from
+   `io_awaitable_promise_base` grows by a pointer (the `next`
+   field) even though most suspension points never queue the
+   continuation (they use symmetric transfer inline).
+3. Conflates two concerns. The promise stores "who resumes me when
+   I'm done" — a parent-child relationship. The `continuation` with
+   `next` means "I'm a queueable unit of work." These are different
+   concepts. The parent's continuation is only queued when the child
+   finishes and the parent must be posted to a different executor.
+   In the common case (same executor, symmetric transfer), it is
+   never queued.
+
+### Option L2: In the Awaitable (chosen)
+
+Each I/O awaitable embeds its own `continuation`. The awaitable
+receives `coroutine_handle<>` in `await_suspend` as it does today,
+wraps it in the embedded `continuation`, and passes that to
+`post()`/`dispatch()`.
+
+**Arguments for:**
+
+1. No change to the `IoAwaitable` concept. The `continuation` is
+   an implementation detail of the awaitable, not a protocol concern.
+2. The awaitable has a stable address for the duration of the
+   suspension (the compiler guarantees this for the operand of
+   `co_await`).
+3. Aligns with corosio's pattern, where I/O services already embed
+   their operation state (`scheduler_op`) in the awaitable.
+4. Zero burden on task authors. `task<T>`, `quitter<T>`, and future
+   task types are unchanged.
+5. Cancel callbacks store `continuation*` pointing into the
+   awaitable, which outlives the suspension.
+
+**Arguments against:**
+
+1. A new `continuation` is initialized at every `co_await`. Not an
+   allocation (it is embedded), but `next` and `h` are set each
+   time.
+2. Combinator and trampoline patterns (parent dispatch, child
+   launch) do not have an I/O awaitable in scope. These sites need
+   their own `continuation` storage in the combinator state or
+   trampoline promise.
+
+### Comparison
+
+| Property | Promise (L1) | Awaitable (L2) |
+|---|---|---|
+| Changes `IoAwaitable` concept? | Yes | No |
+| Continuations per coroutine | One, reused | One per `co_await` |
+| Init cost per suspension | None (already set) | Set `h` and `next` |
+| Alignment with corosio `scheduler_op` | Separate patterns | Same pattern |
+| Burden on task authors | Yes — inherits extra pointer | None |
+| Combinator / trampoline sites | Free (in promise) | Need explicit storage |
+| `io_awaitable_promise_base` size | +8 bytes per coroutine | Unchanged |
+
+**Recommendation:** Option L2. The `continuation` is about how an
+I/O operation interacts with the executor's queue — that is the
+awaitable's concern. The handful of combinator and trampoline sites
+that need their own `continuation` storage are internal to the library
+and explicitly annotated. The promise base stays lean, the IoAwaitable
+protocol is untouched, and task authors see no change.
+
+## The Dispatch Return Type Question
+
+`dispatch` returns `std::coroutine_handle<>` for symmetric transfer.
+Two options exist for what `dispatch` returns now that it accepts
+`continuation&`:
+
+### Option D1: Return `std::coroutine_handle<>` (chosen)
+
+```cpp
+std::coroutine_handle<> dispatch(continuation& c) const;
+```
+
+**Arguments for:**
+
+1. Symmetric transfer is a language-level mechanism. `await_suspend`
+   must return `std::coroutine_handle<>`. The return type of
+   `dispatch` feeds directly into `await_suspend`'s return value.
+2. The inline case returns `c.h` (the wrapped handle). The posted
+   case returns `std::noop_coroutine()`. Both are already
+   `coroutine_handle<>`.
+3. No new type needed in the return position.
+
+**Arguments against:**
+
+None identified.
+
+### Option D2: Return `continuation&` or `continuation*`
+
+**Arguments for:**
+
+1. Symmetry with the parameter type.
+
+**Arguments against:**
+
+1. `await_suspend` cannot return `continuation&`. The language
+   requires `coroutine_handle<>`, `bool`, or `void`.
+2. The caller would have to unwrap `.h` at every return site.
+3. Returning a reference to the input parameter is semantically
+   confusing — the executor may have queued the continuation and
+   returned `noop_coroutine()`, in which case the reference points
+   to a queued object.
+
+**Recommendation:** Option D1. The return type stays
+`std::coroutine_handle<>`. Symmetric transfer is a language
+mechanism that operates on handles, not continuations.
+
+## The Address Stability Invariant
+
+A `continuation` must not move or be destroyed while it is linked
+into an executor's queue. When `post(c)` is called, the executor
+stores `&c` in an intrusive list via `c.next_`. If `c` moves or is
+destroyed before the executor dequeues it, the list has a dangling
+pointer.
+
+This is not a new class of obligation. A `coroutine_handle<>` posted
+to an executor has the same requirement: the coroutine frame it
+points to must remain alive until the handle is resumed. The
+difference is that the old executor interface hid this behind a
+per-post heap allocation — `new work(h)` copied the handle into
+owned storage, so the caller never had to think about it. With
+`continuation&`, the queue node is the caller's object, making the
+lifetime discipline explicit rather than hidden behind an allocation.
+
+In coroutine code, the invariant is satisfied automatically:
+
+- **I/O awaitables** are alive for the duration of the suspension
+  (guaranteed by the compiler for the operand of `co_await`).
+- **Combinator state** outlives all child runners by construction.
+- **Trampoline promises** live inside heap-allocated coroutine frames.
+
+The invariant is only visible in non-coroutine code (tests, manual
+executor interaction), where the caller must ensure the `continuation`
+is declared before the executor or otherwise outlives the queue
+residency. This is the same care required when holding a raw
+`coroutine_handle<>` — the handle must not dangle. The continuation
+merely surfaces an obligation that was always present.
+
+Practical guidelines:
+
+- **Do not store continuations in containers that reallocate.**
+  `std::vector<continuation>` is unsafe if the vector grows after
+  any continuation has been posted. Use
+  `std::unique_ptr<continuation[]>` (allocated once, never
+  reallocated) or `std::array<continuation, N>`.
+
+- **Declaration order matters in non-coroutine code.** A
+  stack-local `continuation` posted to a `thread_pool` must be
+  declared before the pool, so that C++ LIFO destruction destroys
+  the pool (joining its threads) before destroying the continuation.
+
+ASAN builds catch most violations.
+
+## The Strand Question
+
+The strand wraps an inner executor and provides serialized execution.
+Its internal mechanism uses `strand_op` wrapper coroutines with frame
+recycling. Two options exist for how the strand interacts with
+`continuation`:
+
+### Option S1: Strand Queues Continuations Directly
+
+Replace `strand_op` with direct `continuation` queueing via `next`.
+
+**Arguments for:**
+
+1. Eliminates the wrapper coroutine and frame recycling machinery.
+2. Consistent with the thread pool's approach.
+
+**Arguments against:**
+
+1. The strand_op wrapper exists for dispatch-loop control, not just
+   queuing. When the strand resumes a coroutine, the coroutine may
+   complete and its `final_suspend` may do symmetric transfer. The
+   wrapper coroutine catches this: it calls `target.resume()`, and
+   when the target suspends or the wrapper's own `final_suspend`
+   fires, control returns to the dispatch loop. Without the wrapper,
+   symmetric transfer from the target's `final_suspend` would escape
+   the strand's dispatch loop entirely.
+2. Frame recycling amortizes allocation to once per strand lifetime.
+   Removing it does not save allocations — it moves them.
+
+### Option S2: Strand Keeps Its Wrapper, Changes Input Signature (chosen)
+
+The strand's `post(continuation& c)` extracts `c.h` and wraps it in
+a `strand_op` as before. Only the public signature changes.
+
+**Arguments for:**
+
+1. Minimal change. The strand's proven serialization mechanism is
+   untouched.
+2. The `strand_op` wrapper and frame recycling continue to work
+   exactly as before.
+
+**Arguments against:**
+
+1. The strand does not benefit from zero-allocation posting. Each
+   `post` still creates a wrapper coroutine. (But the wrapper frames
+   are recycled, so the steady-state allocation count is zero.)
+
+**Recommendation:** Option S2. The strand's wrapper mechanism solves
+a problem (`continuation` does not: dispatch-loop control). Changing
+only the input signature is the minimal, safe approach.
+
+## The Promise Base Question
+
+`io_awaitable_promise_base` stores the parent's coroutine handle via
+`set_continuation(coroutine_handle<>)` / `continuation()`. Should
+this internal storage change from `coroutine_handle<>` to the
+`continuation` struct?
+
+### Option B1: Change Internal Storage
+
+`cont_` becomes `continuation`. `set_continuation` still accepts
+`coroutine_handle<>` and constructs the struct internally. Task
+authors see no change.
+
+**Arguments for:**
+
+1. `final_suspend` can dispatch the parent's continuation directly
+   to the executor without extra state.
+2. Invisible to task authors — the conversion is internal.
+
+**Arguments against:**
+
+1. Every coroutine frame grows by 8 bytes (the `next` pointer),
+   even though the parent's continuation is rarely queued. The common
+   case (same executor, symmetric transfer) returns `c.h` inline —
+   `next` is dead weight.
+2. Conflates "who resumes me" with "I'm a queueable unit."
+
+### Option B2: Keep Promise Base Unchanged (chosen)
+
+`cont_` stays as `coroutine_handle<>`. Only the specific internal
+types that dispatch through an executor at `final_suspend` store
+their own `continuation`:
+
+- `when_all_core::continuation_` (parent handle for combinator)
+- `when_any_core::continuation_` (same)
+- `boundary_trampoline::parent_` (cross-executor trampoline)
+- `run_awaitable_ex::task_cont_` (initial task dispatch)
+- `run_async_trampoline::task_cont_` (same)
+
+**Arguments for:**
+
+1. Zero size increase for all coroutine frames.
+2. Clean separation: the promise stores a handle for symmetric
+   transfer; the `continuation` struct is only used where queuing
+   actually occurs.
+3. The affected sites are all library-internal, not user-facing.
+
+**Arguments against:**
+
+1. More explicit storage declarations in combinator and trampoline
+   code. (But these are few and clearly annotated.)
+
+**Recommendation:** Option B2. The 8-byte-per-frame cost is
+unnecessary. The handful of internal sites that need a `continuation`
+for executor dispatch are explicit about it.
+
+## Impact on Corosio
+
+Corosio is a separate library that consumes capy's executor interface.
+The `continuation` change requires updates in corosio:
+
+1. **`io_context::executor_type`** — `dispatch` and `post` signatures
+   change. The fast-path logic (return `c.h` if on scheduler thread,
+   else post) is structurally identical.
+
+2. **`dispatch_coro`** — The single dispatch point for all
+   reactor-based I/O completions. Currently takes `coroutine_handle<>`
+   from the reactor_op; will take `continuation&`. The fast-path
+   (`target<io_context::executor_type>()` check) extracts `c.h` for
+   symmetric transfer.
+
+3. **`scheduler::post(coroutine_handle<>)`** — Currently
+   heap-allocates a `post_handler`. With `continuation`, the scheduler
+   can queue the continuation directly via `next`, eliminating the
+   allocation. Whether `continuation::next_` and `scheduler_op`'s
+   intrusive queue unify or coexist is a corosio-internal design
+   question.
+
+4. **I/O operation types** (`reactor_op`, `overlapped_op`,
+   `waiter_node`) — These store `coroutine_handle<>` and
+   `executor_ref`. They would embed a `continuation` instead.
+
+5. **IOCP constraint** — `overlapped_op` must remain an `OVERLAPPED`
+   for the Windows API. `continuation` must coexist with `OVERLAPPED`
+   inheritance, not replace it.
+
+## Areas of Agreement
+
+1. **The executor interface should not traffic in raw
+   `coroutine_handle<>`.** The allocation cost and safety risk are
+   both real.
+
+2. **The `IoAwaitable` concept should not change.** Awaitables
+   receive `coroutine_handle<>` in `await_suspend` and manage the
+   `continuation` internally.
+
+3. **The promise base should not carry a `continuation`.** The
+   per-frame overhead is unjustified for a field that is rarely
+   used for queuing.
+
+4. **`dispatch` returns `std::coroutine_handle<>`.** Symmetric
+   transfer is a language mechanism.
+
+5. **Address stability is the caller's responsibility.** The
+   `continuation` must outlive the queue residency. This is
+   documented and enforced by ASAN.
+
+## Summary
+
+| Property | `coroutine_handle<>` (old) | `continuation&` (new) |
+|---|---|---|
+| Per-post allocation (thread_pool) | `new work(h)` every call | None (intrusive queue) |
+| Per-post allocation (strand) | `strand_op` wrapper (recycled) | Same (wrapper retained) |
+| Type safety | Raw handle, easy to misuse | Struct, harder to fabricate |
+| `IoAwaitable` concept | `await_suspend(handle, env)` | Unchanged |
+| Promise base | `coroutine_handle<>` | Unchanged |
+| Combinator state | `coroutine_handle<>` fields | `continuation` fields |
+| Symmetric transfer | `dispatch` returns handle | Same |
+| Lifetime invariant | Frame must outlive handle (hidden by allocation) | Same obligation, explicit (no allocation) |
+| Breaking change | — | Yes (executor concept) |
+
+The core trade-off is between the simplicity of raw handles (freely
+copyable, lifetime hidden behind per-post allocation) and the
+performance and safety benefits of intrusive continuations
+(zero-allocation posting, type system barrier against misuse). The
+lifetime discipline is not new — a `coroutine_handle<>` always
+required the frame to outlive the handle — but it becomes the
+caller's explicit responsibility instead of being absorbed by a
+heap allocation. In coroutine code, the existing lifetime guarantees
+of awaitables and combinator state satisfy this automatically.
diff --git a/doc/executor-transfer.md b/doc/executor-transfer.md
new file mode 100644
index 000000000..5ca69292a
--- /dev/null
+++ b/doc/executor-transfer.md
@@ -0,0 +1,348 @@
+# Adding `transfer_to` to the Executor Concept
+
+## The Problem
+
+When a coroutine crosses an executor boundary via `run`, the strand on either side of the boundary can get trapped. The bug appears in two directions:
+
+**Case 1: caller on strand, target is a different executor.**
+
+1. A coroutine is running on a strand
+2. It does `co_await run(ex)(f())` where `ex` is not the strand
+3. During `f()`, the strand still thinks it is running - no other queued coroutine can make progress
+
+**Case 2: caller on io_context, target is a strand.**
+
+1. A coroutine is running on an io_context
+2. It does `co_await run(strand)(f())`
+3. `f()` runs inside the strand's invoker. When `f()` completes, the trampoline dispatches the parent back to the io_context. The io_context inlines (same thread), so the parent runs inside the strand's `safe_resume` call. The strand is held until the parent suspends.
+
+Both cases have the same root cause: `dispatch` can inline across an executor boundary, and the symmetric transfer chain runs the entire sequence without returning to the strand's invoker loop.
+
+## Root Cause
+
+The call site is `safe_resume(h)` at `strand_queue.hpp:269`, inside `dispatch_batch`. Each queued item is wrapped in a `strand_op` coroutine whose body calls `safe_resume(target)` at line 132. `safe_resume` calls `h.resume()`, which runs the coroutine until something in the chain returns `void` or `noop_coroutine()` from `await_suspend`. Symmetric transfer does not unwind - it tail-jumps through coroutine handles without returning to the caller.
+
+**Case 1 trace** (caller on strand, target is io_context):
+
+```
+dispatch_batch: safe_resume(wrapper_h)              <- line 269
+  -> wrapper: safe_resume(caller_coro)              <- line 132
+    -> caller does co_await run(io_ctx)(f())
+    -> await_suspend: io_ctx.dispatch(task_cont)
+    -> io_ctx INLINES, returns task_cont.h
+    -> symmetric-transfer to f()
+    -> f() runs to completion
+    -> f()'s final_suspend symmetric-transfers to trampoline
+    -> trampoline: caller_ex.dispatch(parent)
+    -> caller_ex is strand, running_in_this_thread() is true, INLINES
+    -> symmetric-transfer to parent
+    -> parent runs                                  <- still inside safe_resume
+    -> parent suspends
+  <- safe_resume returns
+```
+
+**Case 2 trace** (caller on io_context, target is strand):
+
+```
+dispatch_batch: safe_resume(wrapper_h)              <- line 269
+  -> wrapper: safe_resume(inner_task)               <- line 132
+    -> f() runs on the strand, completes
+    -> f()'s final_suspend symmetric-transfers to trampoline
+    -> trampoline: caller_ex.dispatch(parent)
+    -> caller_ex is io_context, INLINES, returns parent.h
+    -> symmetric-transfer to parent
+    -> parent runs                                  <- still inside safe_resume
+    -> parent suspends
+  <- safe_resume returns
+```
+
+In both cases, the strand's invoker loop does not get control back until the parent suspends. The strand is held for the duration of the inner task, the trampoline, and the parent's resumed execution.
+
+## The Fix: `transfer_to`
+
+The problem is that `dispatch` does not give the source executor a chance to clean up before handing off to the target. We need a third verb on the Executor concept.
+
+The three executor verbs become:
+
+- `std::coroutine_handle<> dispatch(continuation& c)` - Run `c` on this executor. If already on the right thread, return `c.h` for symmetric transfer. Otherwise queue `c` and return `noop_coroutine()`.
+
+- `void post(continuation& c)` - Queue `c` on this executor. Never run inline.
+
+- `std::coroutine_handle<> transfer_to(executor_ref target, continuation& c)` - This executor is releasing `c`. Do whatever is needed to let go, then get `c` running on `target`. Non-serializing executors forward to `target.dispatch(c)`. Serializing executors (strands) post to `target` so the current dispatch batch can finish and the serialization frame can close normally.
+
+`transfer_to` is called on the source executor - the one being left - because the source is the one that knows whether it needs to break the symmetric transfer chain. The target is always a valid (non-null) `executor_ref`.
+
+## Concept Change
+
+`executor.hpp` requires clause gains:
+
+```cpp
+{ ce.transfer_to(executor_ref{}, c) } -> std::same_as<std::coroutine_handle<>>;
+```
+
+## Type Erasure
+
+The vtable in `executor_ref.hpp` gains a slot:
+
+```cpp
+std::coroutine_handle<> (*transfer_to)(void const*, executor_ref, continuation&);
+```
+
+`executor_ref` gains a forwarding method:
+
+```cpp
+std::coroutine_handle<> transfer_to(executor_ref target, continuation& c) const
+{
+    return vt_->transfer_to(ex_, target, c);
+}
+```
+
+The `vtable_for<Ex>` template gains a corresponding lambda:
+
+```cpp
+[](void const* p, executor_ref target, continuation& c) -> std::coroutine_handle<> {
+    return static_cast<Ex const*>(p)->transfer_to(target, c);
+},
+```
+
+## Per-Executor Implementation
+
+### thread_pool::executor_type
+
+The thread pool has no serialization state. `transfer_to` just forwards to the target:
+
+```cpp
+std::coroutine_handle<>
+transfer_to(executor_ref target, continuation& c) const
+{
+    return target.dispatch(c);
+}
+```
+
+The thread pool was never affected by the strand escape bug. Its `dispatch` already calls `post` and returns `noop_coroutine()`, so the symmetric transfer chain always breaks at the thread pool boundary.
+
+### strand
+
+The strand is why `transfer_to` exists. Its implementation posts to the target instead of dispatching:
+
+```cpp
+// strand.hpp
+std::coroutine_handle<>
+transfer_to(executor_ref target, continuation& c) const
+{
+    return detail::strand_service::transfer_to(
+        *impl_, executor_ref(ex_), target, c);
+}
+```
+
+```cpp
+// strand_service.cpp
+std::coroutine_handle<>
+strand_service::transfer_to(
+    strand_impl& impl, executor_ref inner_ex,
+    executor_ref target, continuation& c)
+{
+    target.post(c);
+    return std::noop_coroutine();
+}
+```
+
+This is deliberately minimal. The strand does not touch `dispatch_thread_` or `locked_`. It does not need to. Here is why:
+
+When `transfer_to` is called, we are inside the invoker's `dispatch_pending` call, deep in a `.resume()` on a batch item. The invoker loop looks like this:
+
+```cpp
+for(;;)
+{
+    set_dispatch_thread(*p);
+    dispatch_pending(*p);      // we are here
+    if(try_unlock(*p))
+    {
+        clear_dispatch_thread(*p);
+        co_return;
+    }
+}
+```
+
+`target.post(c)` queues the inner task on the target executor. Returning `noop_coroutine()` causes the coroutine to suspend, so `.resume()` returns and `dispatch_batch` moves to the next item. The batch finishes. The invoker loop reaches `try_unlock`, which either unlocks the strand (if the queue is empty) or loops to drain more work. Either way, the strand releases through its normal path.
+
+The inner task `f()` runs concurrently on the target executor. When it finishes, the trampoline dispatches the parent back to the strand through the normal enqueue path.
+
+An earlier version of this document proposed calling `clear_dispatch_thread` and `try_unlock` from inside `transfer_to`. That is wrong. Calling `try_unlock` mid-batch can set `locked_ = false` while the invoker is still processing items. If new work arrives and triggers a second invoker, two invokers run concurrently and the strand's serialization invariant breaks. The invoker loop is the only safe place to manipulate `locked_` and `dispatch_thread_`.
+
+### Asio-style io_context bridges
+
+User-written executor adapters (like `asio_executor` in the examples) have no serialization state. They get the same trivial implementation as thread_pool:
+
+```cpp
+std::coroutine_handle<>
+transfer_to(executor_ref target, continuation& c) const
+{
+    return target.dispatch(c);
+}
+```
+
+Any user-defined executor that implements its own serialization (an actor, an ordered queue, a custom strand-like primitive) should follow the strand pattern: post to the target instead of dispatching, so the current serialization frame can close normally.
+
+## Trampoline Change
+
+Both the forward trip and the return trip need `transfer_to`. The trampoline must store both executors:
+
+```cpp
+struct promise_type
+{
+    executor_ref caller_ex_;
+    executor_ref target_ex_;   // NEW
+    continuation parent_;
+};
+```
+
+**Forward trip** (`run_awaitable_ex::await_suspend`):
+
+Currently:
+
+```cpp
+task_cont_.h = h;
+return ex_.dispatch(task_cont_);
+```
+
+Becomes:
+
+```cpp
+task_cont_.h = h;
+return caller_env->executor.transfer_to(ex_, task_cont_);
+```
+
+This fixes Case 1. If the caller is on a strand, the strand posts to the target and returns `noop_coroutine()`. The coroutine suspends, the strand's batch finishes, and the invoker loop releases the strand normally.
+
+**Return trip** (trampoline `final_suspend`):
+
+Currently:
+
+```cpp
+return detail::symmetric_transfer(
+    p_->caller_ex_.dispatch(p_->parent_));
+```
+
+Becomes:
+
+```cpp
+return detail::symmetric_transfer(
+    p_->target_ex_.transfer_to(p_->caller_ex_, p_->parent_));
+```
+
+This fixes Case 2. If the target is a strand, the strand posts the parent to the caller's executor and returns `noop_coroutine()`. The trampoline suspends, control returns to the strand's `safe_resume` call, and the invoker loop proceeds to drain and unlock.
+
+If neither the caller nor the target is a strand, both `transfer_to` calls forward to `target.dispatch(c)`, preserving the inline fast path.
+
+## Alternative: Guard Object Instead of `transfer_to`
+
+The `transfer_to` design requires every launch function author to call `transfer_to` on both trips. If someone writes a custom `run`-like function and forgets the return trip, they reintroduce the bug. An RAII guard in the coroutine frame could make the fix automatic. Three approaches:
+
+### Option A: Flag in io_env
+
+The strand sets a flag in the `io_env` when it dispatches a coroutine. The trampoline checks the flag on both trips and posts instead of dispatching when it is set.
+
+`io_env` gains a bool:
+
+```cpp
+struct io_env
+{
+    executor_ref executor;
+    std::stop_token stop_token;
+    std::pmr::memory_resource* frame_allocator = nullptr;
+    bool serialized = false;    // NEW
+};
+```
+
+The strand's `dispatch` sets `serialized = true` in the env before returning the handle. The trampoline reads it:
+
+```cpp
+// forward trip (await_suspend)
+if(caller_env->serialized)
+{
+    ex_.post(task_cont_);
+    return std::noop_coroutine();
+}
+return ex_.dispatch(task_cont_);
+
+// return trip (final_suspend)
+if(/* target env was serialized */)
+{
+    caller_ex_.post(parent_);
+    return std::noop_coroutine();
+}
+return caller_ex_.dispatch(parent_);
+```
+
+**For:** Zero-cost for non-strand cases (one branch on a bool). No changes to the Executor concept. The strand marks it, the trampoline reads it, the user never touches it.
+
+**Against:** The env is `const` from the awaitable's perspective - the strand would need to set the flag before the env reaches the awaitable, which means the flag lives in the env owned by the `run` awaitable, not the caller's env. On the return trip, the trampoline needs to know whether the *target's* env was serialized, but the trampoline only stores the caller's executor, not the target's env. Plumbing the target's serialization flag to the trampoline adds complexity similar to storing `target_ex_`. The env also flows through the entire coroutine chain, so the flag would affect nested `run` calls - a coroutine on a strand that calls `run(strand2)(f())` inside `run(pool)(g())` would see `serialized = true` from the outer strand even though the inner context is a pool.
+
+### Option B: TLS set by the strand invoker
+
+The strand's invoker loop sets a TLS variable before calling `dispatch_pending` and clears it after. Any executor's `dispatch` checks this TLS variable and posts instead of inlining when it is set.
+
+```cpp
+// strand invoker loop
+inline thread_local strand_impl* current_strand = nullptr;
+
+static strand_invoker make_invoker(strand_impl& impl)
+{
+    strand_impl* p = &impl;
+    for(;;)
+    {
+        set_dispatch_thread(*p);
+        current_strand = p;
+        dispatch_pending(*p);
+        current_strand = nullptr;
+        if(try_unlock(*p))
+        {
+            clear_dispatch_thread(*p);
+            co_return;
+        }
+    }
+}
+```
+
+Every executor's `dispatch` checks:
+
+```cpp
+std::coroutine_handle<> dispatch(continuation& c) const
+{
+    if(detail::current_strand)
+    {
+        post(c);
+        return std::noop_coroutine();
+    }
+    // normal dispatch logic
+}
+```
+
+**For:** Fully automatic. No changes to `run`, trampolines, or any launch function. Every executor boundary crossing inside a strand batch posts instead of inlining. Covers both trips, all launch functions, and user-defined launch functions that don't know about `transfer_to`.
+
+**Against:** Every executor's `dispatch` pays a TLS read on every call, even when no strand is involved. Invasive - every concrete executor and `executor_ref::dispatch` must add the check. Nested strands need save/restore (strand B's invoker would clear the TLS set by strand A's invoker). The TLS approach also prevents legitimate inlining within a strand - if a coroutine on a strand dispatches more work to the same strand, it should inline (that is what `running_in_this_thread()` enables), but the TLS check would force it to post. Distinguishing "dispatching to a foreign executor" from "dispatching to the same strand" requires comparing the current strand pointer, adding more logic to every dispatch call.
+
+### Option C: Continuation wrapper
+
+The strand wraps every dispatched continuation in a guard that intercepts the symmetric transfer chain. When the coroutine suspends and `await_suspend` returns a handle that would leave the strand, the guard detects this and posts instead.
+
+**For:** In theory, fully automatic and encapsulated in the strand.
+
+**Against:** Not implementable with the current coroutine model. Symmetric transfer happens inside the C++ runtime - `await_suspend` returns a `std::coroutine_handle<>` and the runtime tail-calls it. There is no interception point between the return from `await_suspend` and the resumption of the target handle. The strand cannot inspect or redirect the handle after `await_suspend` returns it. The `strand_op` wrapper already wraps the target in a coroutine, but `safe_resume(target)` follows the entire symmetric transfer chain before returning - the wrapper only gets control back after the chain ends, which is too late.
+
+### Recommendation
+
+Option B (TLS) is the most automatic but too invasive and has the wrong default for same-strand dispatch. Option C is not implementable. Option A (env flag) has the right shape but the plumbing to get the flag to both trips is roughly as complex as storing `target_ex_` in the trampoline.
+
+`transfer_to` on the Executor concept remains the cleanest design. The cost is that launch function authors must call it on both trips. Since launch functions are library machinery (not user code), and capy ships the primary one (`run`), this is an acceptable constraint. The alternative is to provide `transfer_to` on the concept AND use it automatically inside `run`'s trampoline, so users who write `co_await run(ex)(f())` never think about it. Custom launch functions that want the same correctness call `transfer_to`; those that don't care about strands can use plain `dispatch`.
+
+## Test Impact
+
+`testRunExStrandFirstInstruction` verifies that `running_in_this_thread()` is true inside an inner task passed to `run(strand)`. With `transfer_to` on the forward trip, the caller's executor (pool) calls `transfer_to` which forwards to `strand.dispatch(c)`. The strand is not running on this thread, so it enqueues and posts an invoker. The inner task runs inside the invoker where `running_in_this_thread()` is true. The test should still pass.
+
+Two new tests are needed:
+
+- **Case 1 regression test:** Two coroutines on the same strand. One does `co_await run(pool_ex)(slow_task())`. The second coroutine should make progress while `slow_task` runs on the pool.
+
+- **Case 2 regression test:** A coroutine on an io_context does `co_await run(strand)(f())`. After `f()` completes and the parent resumes, verify the strand is free (not held by the parent's execution).
diff --git a/doc/io-benchmark-analysis.md b/doc/io-benchmark-analysis.md
new file mode 100644
index 000000000..018331591
--- /dev/null
+++ b/doc/io-benchmark-analysis.md
@@ -0,0 +1,322 @@
+
+# I/O Read Stream Benchmark Analysis
+
+## Overview
+
+This benchmark compares three execution models for asynchronous I/O across three stream abstraction levels and two I/O return types. Each cell executes 20,000,000 `read_some` calls on a single thread using a no-op stream, isolating execution model overhead from I/O latency. Each configuration is measured over 5 independent runs preceded by a warmup pass; tables report mean +/- standard deviation. The benchmark source is available at [14].
+
+## Trade-off summary
+
+| Property                          | capy IoAwaitable                      | P2300 sender/receiver                        |
+|-----------------------------------|---------------------------------------|----------------------------------------------|
+| Native concrete performance       | ~31 ns/op, 0 al/op                   | ~32–34 ns/op, 0 al/op                        |
+| Type erasure cost (with recycler) | +5 ns/op, 0 al/op                    | +21–23 ns/op, 1 al/op (conditional on SBO fit [18]) |
+| Type erasure mechanism            | Preallocated awaitable                | Recycled op_state (factory + virtual dispatch)|
+| Why the gap persists              | No allocator path, no allocation call | Allocator fast path + factory + unique_ptr [3]|
+| Synchronous completion            | ~1 ns/op (symmetric transfer)         | ~2.6 ns/op (trampoline [19]); ~1 ns in coroutine via `as_awaitable` [15] |
+| Inline completion (await_ready)   | I/O in `await_ready`, no suspend      | No equivalent; `start()` is void and post-suspend [16] |
+| Looping                           | Native `for` loop                     | `repeat_until` with trampoline [19]           |
+| Bridge to other model (native)    | ~10–11 ns/op, 1 al/op                | ~16 ns/op, 0 al/op                          |
+| Bridge to other model (erased)    | Faster in bex::task, equal in pipeline| ~32 ns/op, 0 al/op                          |
+| Sender → awaitable bridge         | Zero-alloc synthetic frame (`frame_cb`) [10] | `as_awaitable` customization point [2] |
+| Awaitable → sender bridge         | No customization point; `connect-awaitable` uses coroutine (manual HALO in stdexec [17]) [3] | N/A (native) |
+| `as_awaitable` bypass             | N/A (native protocol)                 | Only leaf senders with explicit member [7, 15] |
+| Compile-time env safety           | Structural (in function signature)    | Opt-in (per-sender constraint) [11, 12]      |
+| Composability                     | Coroutine chains (`when_all`, `when_any`, `timeout`) | Sender algorithm pipelines  |
+
+
+## Results
+
+All values are mean +/- stddev over 5 runs (warmup excluded). **Bold** = native execution model (Column A). al/op counts allocation calls per operation, including recycled allocations. The bridge column (B) in Tables 1 and 3 shows 1 al/op — the `scheduled_resume` operation state when IoAwaitables post through `schedule()` → `connect()` → `start()`.
+
+### Table 1: sender/receiver pipeline
+
+|                | A: sender (native)   |          | B: awaitable (bridge) |          |
+|----------------|---------------------:|---------:|----------------------:|---------:|
+|                | ns/op                | al/op    | ns/op                 | al/op    |
+| Native         | **34.3 +/- 0.1**    | **0**    | 46.3 +/- 0.0         | 1        |
+| Abstract       | **47.1 +/- 0.2**    | **1**    | 46.4 +/- 0.0         | 1        |
+| Type-erased    | **57.5 +/- 0.0**    | **1**    | 54.1 +/- 0.1         | 1        |
+| Synchronous    | **2.6 +/- 0.3**     | **0**    | 5.1 +/- 0.1          | 0        |
+
+### Table 2: capy::task
+
+|                | A: awaitable (native)  |          | B: sender (bridge) |          |
+|----------------|-----------------------:|---------:|--------------------:|---------:|
+|                | ns/op                  | al/op    | ns/op               | al/op    |
+| Native         | **31.4 +/- 0.2**      | **0**    | 48.1 +/- 0.3       | 0        |
+| Abstract       | **32.3 +/- 0.2**      | **0**    | 72.2 +/- 0.2       | 1        |
+| Type-erased    | **36.4 +/- 0.1**      | **0**    | 72.1 +/- 0.0       | 1        |
+| Synchronous    | **1.0 +/- 0.2**       | **0**    | 19.0 +/- 0.0       | 0        |
+
+### Table 3: beman::execution::task
+
+|                | A: sender (native)   |          | B: awaitable (bridge) |          |
+|----------------|---------------------:|---------:|----------------------:|---------:|
+|                | ns/op                | al/op    | ns/op                 | al/op    |
+| Native         | **31.9 +/- 0.0**    | **0**    | 43.5 +/- 0.1         | 1        |
+| Abstract       | **55.2 +/- 0.0**    | **1**    | 43.4 +/- 0.0         | 1        |
+| Type-erased    | **55.2 +/- 0.0**    | **1**    | 48.7 +/- 0.1         | 1        |
+| Synchronous    | **1.0 +/- 0.2**     | **0**    | 2.9 +/- 0.2          | 0        |
+
+## Analysis
+
+### Native performance is equivalent
+
+Both execution models achieve ~31–34 ns/op with zero allocations when consuming their native I/O type on a concrete stream. The sender pipeline's native result (34.3 ns/op) is ~2–3 ns higher than the coroutine models (~31–32 ns/op) due to the `trampoline_scheduler` interposed by `repeat_until` on every iteration [19] — even when operations complete asynchronously, the trampoline checks recursion depth and stack consumption before inlining. This overhead is the cost of stack overflow protection in the pure sender path.
+
+### Type erasure costs diverge
+
+- **capy::any_read_stream** (type-erased awaitable): **36.4 ns/op, 0 al/op**. The awaitable is preallocated at stream construction and reused across every `read_some` call. No allocator path is invoked per operation — placement construct into existing storage.
+
+- **sndr_any_read_stream** (type-erased sender): **55.2–57.5 ns/op, 1 al/op**. Each operation traverses the recycling allocator fast path (TLS lookup, size-class bucketing, free-list pop/push), the factory lambda, `concrete_op` construction/destruction, virtual `start()`/`execute()` dispatch, and `unique_ptr` management.
+
+The ~19–21 ns gap and the 1 al/op difference are irreducible with the current sender/receiver architecture. The allocation call represents the minimum structural cost of the `connect`/`start` protocol under type erasure: the operation state's type is erased, so it must be dynamically allocated — even with a recycling allocator.
+
+**stdexec note:** stdexec's `any_sender` uses a 64-byte small buffer optimization (SBO) for type-erased operation states [18]. If a concrete operation state fits within this buffer, no heap allocation occurs — the state is constructed in-place. Whether the 1 al/op manifests depends on the operation state size. This benchmark's type-erased senders produce operation states that exceed the SBO threshold, but simpler senders may avoid the allocation entirely. The structural cost is therefore conditional on operation state size, not inherent to the protocol.
+
+**libunifex note:** libunifex's `any_sender_of` uses `any_unique_t`, which always heap-allocates the type-erased operation state with no SBO [22]. Every `connect` on a type-erased sender allocates regardless of operation state size, confirming that the structural allocation cost is inherent to the `connect`/`start` protocol when the operation state type is erased.
+
+The allocation counts (from the native Column A):
+
+| Stream type           | pipeline | capy::task | bex::task |
+|-----------------------|---------:|-----------:|----------:|
+| Native                | 0        | 0          | 0         |
+| Abstract              | 1        | 0          | 1         |
+| Type-erased           | 1        | 0          | 1         |
+
+The IoAwaitable column (capy::task) shows 0 al/op at all abstraction levels. The sender columns show 1 al/op once the stream is abstracted — the type-erased `concrete_op` allocation that the recycler serves from its free list.
+
+### Bridges are competitive
+
+The non-bold column in each table measures the cost of consuming the opposite I/O type through a bridge. Both bridges use universally correct protocols — not optimized for this benchmark's specific senders.
+
+- **await_sender** (sender → IoAwaitable, Table 2 Col B): Adds ~17 ns and **zero allocations** for native senders. The bridge connects the sender to a bridge receiver and uses an atomic exchange protocol to handle synchronous and asynchronous completion uniformly. The receiver resumes the coroutine directly — no posting through the executor. Abstract and type-erased senders show 1 al/op — the type-erased `concrete_op` allocation from the sender side, not the bridge.
+
+- **as_sender** (IoAwaitable → sender, Tables 1 and 3 Col B): For `beman::execution::task` (Table 3), the `awaitable_sender`'s `as_awaitable` member lets beman's `await_transform` [2, §33.9.11.8] call the IoAwaitable directly, bypassing beman's `sender_awaitable` wrapping. The overhead is ~11 ns over the native sender path. For the sender pipeline (Table 1), the bridge constructs a synthetic coroutine frame (`frame_cb`). Both paths incur 1 al/op from the `scheduled_resume` operation state — the P2300-mandated `schedule()` → `connect()` → `start()` protocol to resume a coroutine on the scheduler.
+
+### The bridged awaitable outperforms native senders under abstraction
+
+In Table 3 (`beman::execution::task`), the bridged awaitable column (Col B) is **faster** than the native sender column (Col A) for abstract and type-erased streams:
+
+- Table 3 abstract: awaitable bridge 43.4 ns (1 al/op) vs sender native **55.2 ns (1 al/op)**
+- Table 3 type-erased: awaitable bridge 48.7 ns (1 al/op) vs sender native **55.2 ns (1 al/op)**
+
+Both sides now show 1 al/op at the abstract/type-erased level, but the awaitable bridge is still 7–12 ns faster. The bridged awaitable's performance is remarkably flat across abstraction levels (43.5/43.4/48.7 ns), while the native sender jumps sharply from 31.9 ns (native) to 55.2 ns (abstract). This occurs because the bridge cost is constant — the IoAwaitable's `await_suspend` always follows the same path regardless of stream abstraction — while the sender model's virtual dispatch and type erasure machinery scale with abstraction level.
+
+In Table 1 (sender pipeline), the bridge is slightly faster: bridge 54.1 ns vs native 57.5 ns at the type-erased level — both at 1 al/op.
+
+### P2300 bridge asymmetry
+
+P2300 provides asymmetric support for bridging between senders and awaitables [2, 3]:
+
+**Sender → Awaitable:** The `as_awaitable` customization point [2, §33.9.11.8] is the first-priority dispatch when a sender is `co_await`'d. A sender can provide an optimized awaitable representation via a member function, completely bypassing the generic `sender_awaitable` wrapping (connect + start + result variant + atomic). The benchmark's sender streams use this to provide an awaitable that inherits `work_item` and enqueues itself directly — single round-trip, zero allocation. This is a legitimate and expected customization [7].
+
+**Awaitable → Sender:** There is no equivalent customization point on the awaitable side. When `connect()` encounters an awaitable, it uses `connect-awaitable` [17], which creates a bridge coroutine. P2006R1 explicitly notes this frame is "not generally eligible for the heap-allocation elision optimization (HALO)" [3]. stdexec mitigates the heap allocation by pre-allocating 64 bytes of storage inline in the operation state and overriding the coroutine's `operator new` to placement-construct into this buffer [17] — a manual HALO that avoids heap allocation when the coroutine frame fits. libunifex's `connect_awaitable` also uses a bridge coroutine for the same purpose but without the inline-storage optimization [22]. Capy's `as_sender` bridge avoids the coroutine frame entirely by using a synthetic `frame_cb`. P4126R0 [10] proposes standardizing this technique.
+
+### stdexec's symmetric transfer recovery in `as_awaitable`
+
+When a sender without an `as_awaitable` member is `co_await`'d inside a coroutine, stdexec wraps it in `__sender_awaitable` [15]. This wrapper recovers symmetric transfer for the sender protocol using an atomic compare-and-swap race detection mechanism:
+
+1. `await_suspend` sets an atomic `__ready_` flag to `false`, then calls `start()` on the operation state.
+2. If the sender completes inline (during `start()`), the receiver's completion handler attempts a CAS on `__ready_` from `false` to `true`. If `await_suspend` hasn't checked yet, the CAS succeeds and the receiver defers resumption to `await_suspend`.
+3. Back in `await_suspend`, a second CAS detects that `__ready_` is already `true` and returns the current coroutine handle — achieving symmetric transfer with a flat stack.
+4. If the sender completed asynchronously, the CAS finds `__ready_` still `false`, sets it to `true`, and returns `noop_coroutine()` to suspend. The receiver resumes the continuation later.
+
+An additional thread ID check short-circuits the atomic protocol: if completion occurs on a different thread, it is definitionally asynchronous and the receiver resumes directly.
+
+When the sender is statically known to complete inline (via the `__completes_inline` concept [21]), stdexec uses a specialized code path that skips the atomic entirely — `await_suspend` calls `connect` and `start` synchronously and returns the coroutine handle directly [15].
+
+This mechanism recovers symmetric transfer **only when a sender is `co_await`'d inside a coroutine**. It does not help the pure sender/receiver pipeline path (Table 1), where no coroutine exists to provide `await_suspend`.
+
+**libunifex note:** libunifex's sender-to-awaitable bridge does not implement this atomic exchange protocol. Its `_as_awaitable::await_suspend` calls `start()` and returns `void` — the coroutine always suspends unconditionally [22]. If the sender completes synchronously during `start()`, the receiver's `complete()` method directly calls `continuation_.resume()`, resuming the coroutine from within the `start()` call stack. This risks stack buildup on repeated synchronous completions and demonstrates that the recovery mechanism is an implementation-specific optimization, not a structural property of the sender model.
+
+### Table 3 and `as_awaitable`
+
+Table 3's native sender column (Col A) benefits from `bex::task`'s `as_awaitable` dispatch. When a sender provides an `as_awaitable` member, `bex::task`'s `await_transform` calls it directly — the sender's `connect` and `start` methods are never invoked. stdexec's implementation confirms this dispatch priority: the `as_awaitable` CPO uses a `__first_callable` chain that checks the sender's member function first, before falling back to generic wrapping [15]. The benchmark's `sndr_read_stream::read_sender` provides exactly this: an `as_awaitable` that returns a lightweight `work_item` awaitable, identical in cost to the IoAwaitable path.
+
+This explains why Table 3 native sender (31.9 ns/op) matches Table 2 native awaitable (31.4 ns/op) — both are measuring the awaitable path, not the sender protocol.
+
+The existing P2300 networking implementation in beman::net [13] does not use `bex::task`. Its examples use a custom `demo::task` whose `await_transform` always creates a `sender_awaiter` that calls `connect` + `start` — with no `as_awaitable` check. Every `co_await net::async_receive(...)` in beman::net pays the full sender protocol cost. For beman::net users, Table 1 (sender pipeline) is more representative of actual per-operation overhead than Table 3.
+
+Senders that do not provide `as_awaitable` — which includes most senders produced by P2300 algorithms like `let_value`, `then`, `when_all`, etc. — also go through the full `connect`/`start` path in `bex::task` via its generic `sender_awaitable` bridge. In stdexec, only `STDEXEC::task` and `exec::basic_task` define `as_awaitable` members [15]; no algorithm sender does. The `as_awaitable` optimization is only available to leaf senders that implement it explicitly.
+
+### Compile-time safety
+
+The IoAwaitable protocol's 2-argument `await_suspend(coroutine_handle<>, io_env const*)` structurally enforces that the execution environment is provided at suspension time. The dependency is in the function signature — the compiler rejects any call site that does not provide it.
+
+In the sender/receiver model, environment availability is checked when a sender queries the receiver's environment inside `start()`. This check IS compile-time (it fails template instantiation if the query is unsupported), but it is opt-in: each sender must explicitly constrain its `connect` method. If the sender author forgets the constraint, the error appears as a deep template instantiation failure rather than a clear signature mismatch. P3164R4 [11] and P3557R2 [12] are addressing diagnostic quality for these errors but are not yet part of the C++26 standard.
+
+### Sender looping and the trampoline
+
+The sender/receiver pipeline (Table 1) uses stdexec's `repeat_until` [19] composed with `let_value` and `just` to implement a loop. The `repeat_until` algorithm wraps each iteration with a `trampoline_scheduler` that tracks recursion depth (default 16) and stack consumption (default 4096 bytes). When either limit is exceeded, execution is deferred to a queue and drained iteratively [19]. This prevents stack overflow from repeated inline completions — enabling the Synchronous row in Table 1.
+
+The trampoline adds a small but measurable overhead to every iteration: the native pipeline (34.3 ns/op) is ~2–3 ns slower than the coroutine models (~31–32 ns/op) even for asynchronous operations, because the trampoline checks are executed unconditionally. This is the baseline cost of stack overflow protection in the pure sender path.
+
+At the native level, the pipeline (34.3 ns/op) remains comparable to the coroutine models. The gap widens under type erasure because the pipeline's `connect` on each iteration traverses the factory + allocator path (1 al/op), whereas a coroutine reuses its frame (0 al/op).
+
+**libunifex note:** libunifex provides both `repeat_effect_until` (with the same direct-recursion design and no built-in trampoline) and a separate `trampoline_scheduler` that tracks recursion depth (default 16) and defers to an iterative drain queue when the limit is exceeded [22]. The trampoline is not integrated into the repeat algorithm — users must compose them explicitly. This confirms the pattern: the trampoline is a general-purpose mitigation, not a solved problem within sender loop algorithms.
+
+### Synchronous completions
+
+In real networking I/O, many operations complete without waiting for the kernel: reads from a socket with data already in the receive buffer, writes to a non-full send buffer, DNS cache hits, TLS session resumptions, io_uring completions already batched in the completion queue. In a high-throughput server, this is the common case — a busy connection often has data waiting before the application reads it.
+
+The Synchronous row measures this scenario. The I/O operation completes immediately — no executor posting, no thread pool round-trip.
+
+|                | capy::task (awaitable) | beman::task (sender via as_awaitable) | sender pipeline (trampoline) |
+|----------------|:----------------------:|:-------------------------------------:|:----------------------------:|
+| Synchronous    | 1.0 ns/op              | 1.0 ns/op                             | 2.6 ns/op                    |
+
+Both coroutine models achieve ~1 ns/op through symmetric transfer — `await_suspend` returns the coroutine handle and the compiler performs a tail call. The stack stays flat regardless of how many operations complete synchronously in sequence.
+
+The sender/receiver pipeline achieves 2.6 ns/op using stdexec's `repeat_until` with `trampoline_scheduler` [19]. The trampoline detects inline completions and defers to an iterative queue when recursion limits are reached, keeping the stack bounded. This is 2.6x slower than the coroutine path — the overhead comes from the trampoline's recursion depth and stack consumption checks on every iteration, plus the occasional queue drain when limits are exceeded. Without the trampoline, repeated inline completions would cause stack overflow because `start()` is void [16] — the only way to deliver a result is through the receiver (`set_value`), which recurses into the next iteration's `connect`/`start`.
+
+Coroutines handle synchronous completions more efficiently through two mechanisms, neither of which has a sender equivalent:
+
+- **`await_ready`** — The awaitable can perform the I/O (e.g., `recvmsg`) in `await_ready` and return `true` if data is available. The coroutine never suspends — no handle manipulation, no symmetric transfer, no atomic exchange. This is the fastest possible path for inline completions. A sender cannot do this because `start()` is called inside `await_suspend`, after the coroutine has already suspended. The work cannot be moved earlier. (Both stdexec's `__sender_awaitable` [15] and libunifex's `_as_awaitable` [22] unconditionally return `false` from `await_ready`, confirming across implementations that senders cannot use this optimization even when wrapped.)
+
+- **`await_suspend` return value** — If `await_ready` returns `false`, `await_suspend` can still complete the I/O and return the coroutine handle for symmetric transfer. The compiler performs a tail call — the stack stays flat regardless of how many operations complete synchronously in sequence. (stdexec recovers this mechanism for senders via `__sender_awaitable`'s atomic CAS protocol [15] — see *stdexec's symmetric transfer recovery* above — but only when the sender is consumed inside a coroutine.)
+
+The sender model would need an equivalent mechanism (a way for `start()` to indicate "completed synchronously, here's the result"), which does not exist in P2300 and would be a fundamental change to the operation state protocol. stdexec's `__completion_behavior` system [21] can statically determine whether a sender completes inline, enabling optimized code paths at compile time, but this is a static property used by wrappers — it does not change the `start()` return type.
+
+### What the bridge columns demonstrate
+
+The bridged columns represent the real cost that arises when a library returns one I/O type but the application uses the other execution model. A networking library built on IoAwaitables will pay the `as_sender` tax when consumed from a sender pipeline. Conversely, a sender-based I/O library will pay the `await_sender` tax when consumed from `capy::task`.
+
+Both bridges are designed for universal correctness:
+
+- **await_sender** uses an atomic exchange protocol that safely handles senders completing synchronously during `start()`, asynchronously on the same thread, or asynchronously on a different thread.
+
+- **as_sender** uses the P2300 environment query mechanism [2, §33.9.4] to obtain its executor, provides an `as_awaitable` member for coroutine integration [2, §33.9.11.8], and provides a `connect` path for sender pipelines — each using the most efficient mechanism available for that context.
+
+The bridge overhead is modest — both directions add 11–17 ns for native streams. The `await_sender` bridge (Table 2 Col B) incurs zero allocation calls for native senders; the `as_sender` bridge (Tables 1 and 3 Col B) incurs 1 al/op from the `scheduled_resume` operation state required by P2300's `schedule()` → `connect()` → `start()` protocol.
+
+### Scope and limitations
+
+This benchmark measures per-operation overhead for sequential I/O in a tight loop. It does not measure:
+
+- **Concurrent composition** — `when_all` over N streams, fan-out patterns.
+- **Real I/O latency** — io_uring submit/complete cycles, network round-trips.
+- **Multi-threaded work distribution** — cross-thread scheduling, work stealing, NUMA-aware dispatch.
+- **Compile time and diagnostic quality** — template instantiation depth, error message clarity.
+
+
+## Methodology
+
+**Execution models** (one per table):
+
+- **sender/receiver pipeline** — Pure sender pipeline using stdexec's `repeat_until` [19] + `let_value`. No coroutines. Driven by `sender_thread_pool` via `sync_wait`. The `repeat_until` algorithm wraps each iteration with a `trampoline_scheduler` [19] that bounds recursion depth and stack consumption, preventing stack overflow from repeated inline completions.
+- **capy::task** — Capy's coroutine task, driven by `capy::thread_pool`. Natively consumes IoAwaitables.
+- **beman::execution::task** — Beman's P2300 coroutine task [1], driven by `sender_thread_pool`. Natively consumes senders. **Note:** `bex::task`'s `await_transform` checks `as_awaitable` on the sender (first-priority dispatch per [exec.as.awaitable]). When the sender provides an `as_awaitable` member — as the benchmark's `sndr_read_stream` does — the task calls it directly, bypassing `connect`/`start` entirely. Table 3's native sender column (Col A) therefore measures the `as_awaitable` path, not the full sender protocol. This is the best-case scenario for senders in coroutines. See *Table 3 and `as_awaitable`* in the Analysis section for implications.
+
+**Stream abstraction levels** (one per row):
+
+- **Native** — Concrete stream type, fully visible to the compiler. No virtual dispatch or type erasure.
+- **Abstract** — Virtual base class. The caller sees an interface; the implementation is hidden behind virtual dispatch.
+- **Type-erased** — Value-type erasure. `capy::any_read_stream` for awaitables (zero steady-state allocation via cached awaitable storage); `sndr_any_read_stream` for senders (heap-allocated stream, sender type erasure via SBO).
+
+**I/O return types** (one per column):
+
+- **Column A** — Native I/O type for the execution model.
+- **Column B** — Bridged I/O type (opposite protocol).
+
+The native column (A) is shown in **bold**.
+
+**Thread pools:**
+
+Both thread pools inherit from `boost::capy::execution_context`, providing the same recycling memory resource for coroutine frame allocation. Both use intrusive work queues, mutex + condition variable synchronization, and identical outstanding-work tracking with `std::atomic<std::size_t>` and `memory_order_acq_rel`.
+
+- **capy::thread_pool** — Used in Table 2 Col A. Posts `continuation&` objects via intrusive linked list (zero allocation per post).
+- **sender_thread_pool** — Used in all other cells. Posts `work_item*` intrusively when the sender's operation state inherits `work_item` (zero allocation). Has no `post(coroutine_handle<>)` — P2300 execution contexts only expose `schedule()` [20], which returns a sender. To resume a coroutine on the scheduler, the caller must go through `schedule()` → `connect()` → `start()`, heap-allocating the operation state (one allocation per post).
+
+The `schedule`/`connect`/`start` allocation path is used when IoAwaitables post through the executor adapter (Tables 1 and 3 Col B). This is a cross-protocol adaptation cost: the IoAwaitable produces a `coroutine_handle<>`, but P2300 has no way to accept a bare handle. The adapter must create a `scheduled_resume` operation state — `connect(schedule(sched), resume_receiver)` — and heap-allocate it because the coroutine is suspended and cannot host it. The operation state IS the queue node (inherits `work_item`), so no additional wrapping is needed, but the allocation is unavoidable. Real P2300 execution contexts (stdexec's `run_loop`, `static_thread_pool`) use the same intrusive queue pattern [6].
+
+**Operation state recycling:**
+
+Type-erased senders allocate their operation state (`concrete_op`) via `op_base::operator new`, which is overridden to use the same recycling memory resource used for coroutine frames. After warmup, these allocations are served from a thread-local free list in O(1) without calling global `operator new`. Both the coroutine frame recycler and the op_state recycler use the same `boost::capy::get_recycling_memory_resource()`, providing equivalent amortized allocation cost. The recycling allocator is functionally equivalent to what P3433R1 [9] proposes for allocator support in operation states.
+
+This means the benchmark shows both models at their best: coroutine frames are recycled (standard practice for coroutine-based systems), and sender operation states are recycled (the strongest available mitigation for the structural allocation). The remaining performance differences reflect irreducible overhead — allocator fast-path cost, factory dispatch, virtual calls — not allocation policy. The al/op counts in the tables reflect allocation *calls* (including recycled), not global heap hits, so the structural allocation demand is visible even when the recycler eliminates the malloc cost.
+
+**Allocation tracking:**
+
+All allocation paths go through a single counter. Global `operator new` increments `g_alloc_count` before calling `malloc`. The recycling memory resource is wrapped in a `counting_memory_resource` proxy that increments the same counter before delegating — both for type-erased sender operation states (`op_base::operator new`) and for coroutine frame allocations (`polymorphic_allocator` passed to `bex::task`). This means al/op reflects *allocation calls per operation* regardless of whether they hit the global heap or the recycler's free list. The counter measures structural allocation demand, not allocation policy.
+
+**Warmup:**
+
+The first complete pass through all cells is a warmup (results discarded). This eliminates instruction cache, branch predictor, and CPU frequency scaling effects from the first execution model measured. The 5 measured runs begin from a thermally stable state.
+
+**Compiler optimization:**
+
+Each `co_await` suspends the coroutine and posts to the thread pool's work queue, acquiring a mutex, pushing to the intrusive queue, and signaling a condition variable. These are observable side effects that prevent the compiler from eliminating the benchmark loops.
+
+## Bridge Implementations
+
+### await_sender (sender → IoAwaitable)
+
+Used in Table 2 Column B. Wraps a P2300 sender so it can be `co_await`'d inside a `capy::task`.
+
+**Mechanism:** The bridge creates a `sender_awaitable` that placement-constructs the sender's operation state into a stack-allocated buffer. A `bridge_receiver` stores the sender's completion result in a `std::variant` discriminated by completion channel (value, error_code, exception_ptr, stopped).
+
+**Synchronous completion safety:** The bridge uses an `std::atomic<bool>` exchange protocol. Both `await_suspend` (after calling `start()`) and the receiver's completion function call `done_.exchange(true, memory_order_acq_rel)`. Whichever side arrives second (sees `true` from the exchange) is responsible for resuming the coroutine. If the sender completes synchronously during `start()`, `await_suspend` detects this and returns the coroutine handle for symmetric transfer — the coroutine never actually suspends, avoiding stack corruption [5]. This is the same pattern used by stdexec's `__sender_awaitable` [15], which uses `std::atomic<bool>` with a compare-and-swap protocol and a `std::thread::id` check for the same purpose, and by beman::execution's `sender_awaitable` [1], which uses `atomic<thread::id>`.
+
+**Result routing:** The bridge inspects the sender's error completion signatures at compile time. If the sender can complete with `set_error(std::error_code)`, `await_resume` returns `io_result<T>` so the error code is a value, not an exception. Otherwise, `await_resume` returns the value directly and rethrows exceptions.
+
+**Zero bridge allocations:** The operation state lives on the coroutine frame (via placement new into a sized buffer). The receiver resumes the coroutine directly — no posting through the executor. The 0 al/op for native senders confirms this.
+
+### as_sender (IoAwaitable → sender)
+
+Used in Tables 1 and 3 Column B. Wraps an IoAwaitable so it can be consumed by the P2300 sender/receiver model.
+
+**Mechanism:** The bridge constructs a synthetic coroutine frame (`frame_cb`) — a 24-byte struct whose first two members (resume/destroy function pointers) match the coroutine frame ABI layout used by MSVC, GCC, and Clang. `coroutine_handle<>::from_address(&cb_)` produces a valid handle whose `.resume()` calls the bridge's completion callback. This avoids allocating an actual coroutine frame, unlike P2300's `connect-awaitable` which creates a bridge coroutine with a heap-allocated frame that is "not generally eligible for the heap-allocation elision optimization (HALO)" [3]. stdexec mitigates this in its `__connect_awaitable` implementation by pre-allocating the coroutine frame inline in the operation state's storage buffer (64 bytes on 64-bit systems), providing a manual HALO that avoids heap allocation when the frame fits [17]. P4126R0 [10] proposes standardizing the synthetic frame technique as a "universal continuation model."
+
+**Executor query:** The bridge obtains a Capy-compatible executor from the P2300 environment using the standard query forwarding mechanism [2, §33.9.4]. It defines a `get_io_executor` query CPO marked as a forwarding query (`forwarding_query(get_io_executor_t{})` returns `true`), ensuring it propagates through sender adapter chains via `FWD-ENV` [2, §33.9.3.5]. Since `starts_on` injects `sched_env<Scheduler>` (which only answers `get_scheduler` and `get_domain`), the bridge queries `get_scheduler(env)` — which IS forwarded — then queries the scheduler itself: `scheduler.query(get_io_executor_t{})`. The scheduler returns a Capy executor by value, which the bridge stores in the operation state. No benchmark-specific types appear in the bridge code.
+
+**`as_awaitable` customization:** The `awaitable_sender` provides an `as_awaitable(Promise&)` member, which is the first-priority dispatch in `[exec.as.awaitable]` [2, §33.9.11.8]. stdexec's implementation confirms this priority: the `as_awaitable` CPO dispatches via `__first_callable` with the member function check (`__with_member`) as the highest priority, followed by transformed sender member, simple awaitable, and finally generic sender wrapping [15]. When `co_await`'d inside a `bex::task`, beman's `await_transform` calls this member instead of wrapping the sender in `sender_awaitable`. The member creates a standard awaitable that calls the IoAwaitable's 2-argument `await_suspend(handle, io_env const*)` directly, adapting it to the standard 1-argument protocol. This eliminates a double bridge (IoAwaitable → sender → `sender_awaitable` → awaitable) that would otherwise add connect/start/variant/atomic overhead.
+
+**Completion routing:** The `frame_cb` callback calls `await_resume()` on the IoAwaitable and routes the result through P2300 completion channels based on the return type: `void` → `set_value()`, `error_code` → `set_value()`/`set_error(ec)`, other types → `set_value(T)`.
+
+## References
+
+[1] Beman Project. *execution26: Beman.Execution*. https://github.com/bemanproject/execution
+
+[2] P2300R10. *std::execution*. Niebler, Baker, Hollman, et al. https://wg21.link/P2300
+
+[3] P2006R1. *Eliminating heap-allocations in sender/receiver with connect()/start() as basis operations*. Baker, Niebler, et al. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2006r1.pdf
+
+[4] P3187R1. *Remove ensure_started and start_detached from P2300*. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2024/p3187r1.pdf
+
+[5] P3552R3. *Add a Coroutine Task Type*. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p3552r3.html
+
+[6] NVIDIA. *stdexec: NVIDIA's reference implementation of P2300*. https://github.com/NVIDIA/stdexec
+
+[7] C++ Working Draft. *[exec.as.awaitable]*. https://eel.is/c++draft/exec.as.awaitable
+
+[8] P2079R6. *System execution context*. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p2079r6.html
+
+[9] P3433R1. *Allocator Support for Operation States*. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p3433r1.pdf
+
+[10] P4126R0. *A Universal Continuation Model*. https://isocpp.org/files/papers/P4126R0.pdf
+
+[11] P3164R4. *Early Diagnostics for Sender Expressions*. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p3164r4.html
+
+[12] P3557R2. *High-Quality Sender Diagnostics with Constexpr Exceptions*. https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2025/p3557r2.html
+
+[13] Beman Project. *net: Beman.Net — P2300-based networking*. https://github.com/bemanproject/net
+
+[14] Gerbino, S. *I/O Read Stream Benchmark source*. https://github.com/cppalliance/capy/tree/develop/bench/beman
+
+[15] NVIDIA/stdexec. `include/stdexec/__detail/__as_awaitable.hpp` — `as_awaitable` CPO implementation: `__first_callable` dispatch priority (line 473), `__sender_awaitable` with atomic CAS symmetric transfer recovery (lines 309–342), `__sender_awaitable` for inline-completing senders (lines 349–396), `await_ready` always returns `false` (line 109), `__sender_awaitable_base` atomic state (lines 134–141). Only `STDEXEC::task` (line 256 of `__task.hpp`) and `exec::basic_task` (line 462 of `exec/task.hpp`) define `as_awaitable` members; no algorithm sender does. https://github.com/NVIDIA/stdexec
+
+[16] NVIDIA/stdexec. `include/stdexec/__detail/__operation_states.hpp` — `start_t` CPO enforces `start()` returns `void` via `static_assert` (line 45). https://github.com/NVIDIA/stdexec
+
+[17] NVIDIA/stdexec. `include/stdexec/__detail/__connect_awaitable.hpp` — Awaitable-to-sender bridge using a bridge coroutine with manual HALO: `operator new` placement-constructs into pre-allocated 64-byte storage in the operation state (lines 169–181), avoiding heap allocation when the coroutine frame fits. https://github.com/NVIDIA/stdexec
+
+[18] NVIDIA/stdexec. `include/exec/any_sender_of.hpp` and `include/stdexec/__detail/__any.hpp` — Type-erased sender operation state uses 64-byte SBO buffer (`_iopstate_base_t` at line 397 of `any_sender_of.hpp`). `__emplace_into` (line 549 of `__any.hpp`) constructs in-place when the model fits, falling back to allocator-based heap allocation otherwise. https://github.com/NVIDIA/stdexec
+
+[19] NVIDIA/stdexec. `include/exec/repeat_until.hpp` and `include/exec/trampoline_scheduler.hpp` — `repeat_until` wraps child senders with `trampoline_scheduler` (line 154 of `repeat_until.hpp`). The trampoline tracks recursion depth (default 16) and stack consumption (default 4096 bytes), deferring to an iterative queue when limits are exceeded (lines 147–174 of `trampoline_scheduler.hpp`). https://github.com/NVIDIA/stdexec
+
+[20] NVIDIA/stdexec. `include/stdexec/__detail/__schedulers.hpp` — `schedule_t` CPO (lines 43–70): the only entry point for P2300 schedulers. No `post(coroutine_handle<>)` exists in the interface. https://github.com/NVIDIA/stdexec
+
+[21] NVIDIA/stdexec. `include/stdexec/__detail/__completion_behavior.hpp` — Completion behavior tracking system: `__inline_completion` (line 58), `__completes_inline` concept (line 201), `__completes_where_it_starts` concept (line 205). Used by `as_awaitable` to select atomic-free code paths for statically-known inline senders. https://github.com/NVIDIA/stdexec
+
+[22] Meta/libunifex. `include/unifex/await_transform.hpp`, `include/unifex/connect_awaitable.hpp`, `include/unifex/any_sender_of.hpp`, `include/unifex/repeat_effect_until.hpp`, `include/unifex/trampoline_scheduler.hpp` — Meta's prototype sender/receiver implementation (predates P2300 standardization). `_as_awaitable::await_ready()` unconditionally returns `false` (line 176 of `await_transform.hpp`). `await_suspend` calls `start()` and returns `void` with no atomic exchange protocol for synchronous completion detection (line 213). `connect_awaitable` uses a bridge coroutine without inline-storage manual HALO (line 188 of `connect_awaitable.hpp`). `any_sender_of` heap-allocates all type-erased operation states via `any_unique_t` with no SBO (line 52 of `any_sender_of.hpp`). `repeat_effect_until` uses direct recursion with no built-in trampoline (line 63 of `repeat_effect_until.hpp`); separate `trampoline_scheduler` exists but is not integrated (line 31 of `trampoline_scheduler.hpp`). https://github.com/facebookexperimental/libunifex
diff --git a/doc/local-playbook.yml b/doc/local-playbook.yml
index 049758bfb..16e9ef3db 100644
--- a/doc/local-playbook.yml
+++ b/doc/local-playbook.yml
@@ -10,7 +10,7 @@ content:
   sources:
     - url: ..
       start_path: doc
-      edit_url: 'https://github.com/cppalliance/capy/edit/{refname}/{path}'
+      edit_url: 'https://github.com/cppalliance/capy/edit/develop/{path}'
 
 ui:
   bundle:
@@ -35,6 +35,7 @@ antora:
           tag: 'develop'
           variable: 'BOOST_SRC_DIR'
           system-env: 'BOOST_SRC_DIR'
+    - require: '@cppalliance/antora-downloads-extension'
 
 asciidoc:
   attributes:
diff --git a/doc/modules/ROOT/nav.adoc b/doc/modules/ROOT/nav.adoc
index fb6125afc..e6e195b7b 100644
--- a/doc/modules/ROOT/nav.adoc
+++ b/doc/modules/ROOT/nav.adoc
@@ -19,6 +19,7 @@
 ** xref:4.coroutines/4e.cancellation.adoc[Stop Tokens and Cancellation]
 ** xref:4.coroutines/4f.composition.adoc[Concurrent Composition]
 ** xref:4.coroutines/4g.allocators.adoc[Frame Allocators]
+** xref:4.coroutines/4h.lambda-captures.adoc[Lambda Coroutine Captures]
 * xref:5.buffers/5.intro.adoc[Buffer Sequences]
 ** xref:5.buffers/5a.overview.adoc[Why Concepts, Not Spans]
 ** xref:5.buffers/5b.types.adoc[Buffer Types]
@@ -33,31 +34,44 @@
 ** xref:6.streams/6d.buffer-concepts.adoc[Buffer Sources and Sinks]
 ** xref:6.streams/6e.algorithms.adoc[Transfer Algorithms]
 ** xref:6.streams/6f.isolation.adoc[Physical Isolation]
-* xref:7.examples/7.intro.adoc[Example Programs]
-** xref:7.examples/7a.hello-task.adoc[Hello Task]
-** xref:7.examples/7b.producer-consumer.adoc[Producer-Consumer]
-** xref:7.examples/7c.buffer-composition.adoc[Buffer Composition]
-** xref:7.examples/7d.mock-stream-testing.adoc[Mock Stream Testing]
-** xref:7.examples/7e.type-erased-echo.adoc[Type-Erased Echo]
-** xref:7.examples/7f.timeout-cancellation.adoc[Timeout with Cancellation]
-** xref:7.examples/7g.parallel-fetch.adoc[Parallel Fetch]
-** xref:7.examples/7h.custom-dynamic-buffer.adoc[Custom Dynamic Buffer]
-** xref:7.examples/7i.echo-server-corosio.adoc[Echo Server with Corosio]
-** xref:7.examples/7j.stream-pipeline.adoc[Stream Pipeline]
-* xref:8.design/8.intro.adoc[Design]
-** xref:8.design/8a.CapyLayering.adoc[Layered Abstractions]
-** xref:8.design/8b.Separation.adoc[Why Capy Is Separate]
-** xref:8.design/8c.ReadStream.adoc[ReadStream]
-** xref:8.design/8d.ReadSource.adoc[ReadSource]
-** xref:8.design/8e.BufferSource.adoc[BufferSource]
-** xref:8.design/8f.WriteStream.adoc[WriteStream]
-** xref:8.design/8g.WriteSink.adoc[WriteSink]
-** xref:8.design/8h.BufferSink.adoc[BufferSink]
-** xref:8.design/8i.TypeEraseAwaitable.adoc[Type-Erasing Awaitables]
-** xref:8.design/8j.any_buffer_sink.adoc[AnyBufferSink]
-** xref:8.design/8k.Executor.adoc[Executor]
-** xref:8.design/8l.RunApi.adoc[Run API]
-** xref:8.design/8m.WhyNotCobalt.adoc[Why Not Cobalt?]
-** xref:8.design/8n.WhyNotCobaltConcepts.adoc[Why Not Cobalt Concepts?]
-** xref:8.design/8o.WhyNotTMC.adoc[Why Not TooManyCooks?]
+* xref:7.testing/7.intro.adoc[Testing]
+** xref:7.testing/7a.drivers.adoc[Driving Tests]
+** xref:7.testing/7b.mock-streams.adoc[Mock Streams]
+** xref:7.testing/7c.mock-sources-sinks.adoc[Mock Sources and Sinks]
+** xref:7.testing/7d.mock-buffer-concepts.adoc[Mock Buffer Sources and Sinks]
+** xref:7.testing/7e.buffer-inspection.adoc[Buffer Inspection]
+* xref:8.examples/8.intro.adoc[Example Programs]
+** xref:8.examples/8a.hello-task.adoc[Hello Task]
+** xref:8.examples/8b.producer-consumer.adoc[Producer-Consumer]
+** xref:8.examples/8c.buffer-composition.adoc[Buffer Composition]
+** xref:8.examples/8d.mock-stream-testing.adoc[Mock Stream Testing]
+** xref:8.examples/8e.type-erased-echo.adoc[Type-Erased Echo]
+** xref:8.examples/8f.timeout-cancellation.adoc[Timeout with Cancellation]
+** xref:8.examples/8g.parallel-fetch.adoc[Parallel Fetch]
+** xref:8.examples/8h.custom-dynamic-buffer.adoc[Custom Dynamic Buffer]
+** xref:8.examples/8i.echo-server-corosio.adoc[Echo Server with Corosio]
+** xref:8.examples/8j.stream-pipeline.adoc[Stream Pipeline]
+** xref:8.examples/8k.strand-serialization.adoc[Strand Serialization]
+** xref:8.examples/8l.async-mutex.adoc[Async Mutex]
+** xref:8.examples/8m.parallel-tasks.adoc[Parallel Tasks]
+** xref:8.examples/8n.custom-executor.adoc[Custom Executor]
+* xref:9.design/9.intro.adoc[Design]
+** xref:9.design/9a.CapyLayering.adoc[Layered Abstractions]
+** xref:9.design/9b.Separation.adoc[Why Capy Is Separate]
+** xref:9.design/9c.ReadStream.adoc[ReadStream]
+** xref:9.design/9d.ReadSource.adoc[ReadSource]
+** xref:9.design/9e.BufferSource.adoc[BufferSource]
+** xref:9.design/9f.WriteStream.adoc[WriteStream]
+** xref:9.design/9g.WriteSink.adoc[WriteSink]
+** xref:9.design/9h.BufferSink.adoc[BufferSink]
+** xref:9.design/9i.TypeEraseAwaitable.adoc[Type-Erasing Awaitables]
+** xref:9.design/9j.any_buffer_sink.adoc[AnyBufferSink]
+** xref:9.design/9k.Executor.adoc[Executor]
+** xref:9.design/9l.RunApi.adoc[Run API]
+** xref:9.design/9m.WhyNotCobalt.adoc[Why Not Cobalt?]
+** xref:9.design/9n.WhyNotCobaltConcepts.adoc[Why Not Cobalt Concepts?]
+** xref:9.design/9o.WhyNotTMC.adoc[Why Not TooManyCooks?]
+* xref:A.specification-methods/A.intro.adoc[Methods of API Description]
+** xref:A.specification-methods/Ab.cancellation.adoc[Cancellation]
+** xref:A.specification-methods/Ac.contingencies.adoc[Contingencies]
 * xref:reference:boost/capy.adoc[Reference]
diff --git a/doc/modules/ROOT/pages/2.cpp20-coroutines/2c.machinery.adoc b/doc/modules/ROOT/pages/2.cpp20-coroutines/2c.machinery.adoc
index ef6ea5967..ef021c90a 100644
--- a/doc/modules/ROOT/pages/2.cpp20-coroutines/2c.machinery.adoc
+++ b/doc/modules/ROOT/pages/2.cpp20-coroutines/2c.machinery.adoc
@@ -37,25 +37,29 @@ Called if an exception escapes the coroutine body. Typically you either rethrow
 
 The compiler transforms your coroutine body into something resembling this pseudocode:
 
+NOTE: The `co_await` keywords below are intentional. This mirrors the {cpp} standard's own description ({cpp}20 [dcl.fct.def.coroutine]/5), which uses `co_await` to express the logical suspension points. The compiler expands each `co_await` into the full awaiter protocol (`await_ready`, `await_suspend`, `await_resume`) as described in xref:2.cpp20-coroutines/2b.syntax.adoc#_awaitables_and_awaiters[Awaitables and Awaiters].
+
 [source,cpp]
 ----
 {
     promise_type promise;
     auto return_object = promise.get_return_object();
-    
-    co_await promise.initial_suspend();
-    
+
+    co_await promise.initial_suspend();  // <1>
+
     try {
         // your coroutine body goes here
     }
     catch (...) {
         promise.unhandled_exception();
     }
-    
-    co_await promise.final_suspend();
+
+    co_await promise.final_suspend();  // <2>
 }
 // coroutine frame is destroyed when control flows off the end
 ----
+<1> Suspension point before the body runs. If `initial_suspend()` returns `suspend_always`, the coroutine starts suspended.
+<2> Suspension point after the body completes. If `final_suspend()` returns `suspend_always`, the frame persists for the caller to inspect or destroy.
 
 Important observations:
 
diff --git a/doc/modules/ROOT/pages/4.coroutines/4a.tasks.adoc b/doc/modules/ROOT/pages/4.coroutines/4a.tasks.adoc
index 2c438c2d0..8a14c71ed 100644
--- a/doc/modules/ROOT/pages/4.coroutines/4a.tasks.adoc
+++ b/doc/modules/ROOT/pages/4.coroutines/4a.tasks.adoc
@@ -17,7 +17,7 @@ Key characteristics:
 * *Symmetric transfer* — Efficient resumption without stack accumulation
 * *Executor inheritance* — Inherits the caller's executor unless explicitly bound
 * *Stop token propagation* — Forward-propagates cancellation signals
-* *HALO support* — Enables heap allocation elision when possible
+* *xref:4.coroutines/4g.allocators.adoc#_halo_optimization[HALO] support* — Enables Heap Allocation eLision Optimization when possible
 
 == Declaring task Coroutines
 
diff --git a/doc/modules/ROOT/pages/4.coroutines/4b.launching.adoc b/doc/modules/ROOT/pages/4.coroutines/4b.launching.adoc
index 6d535ddbd..cbcd2eab0 100644
--- a/doc/modules/ROOT/pages/4.coroutines/4b.launching.adoc
+++ b/doc/modules/ROOT/pages/4.coroutines/4b.launching.adoc
@@ -91,21 +91,6 @@ run_async(ex,
 
 When no handlers are provided, results are discarded and exceptions are rethrown (causing `std::terminate` if uncaught).
 
-=== Stop Token Support
-
-Pass a stop token to enable cooperative cancellation:
-
-[source,cpp]
-----
-std::stop_source source;
-run_async(ex, source.get_token())(cancellable_task());
-
-// Later, to request cancellation:
-source.request_stop();
-----
-
-The stop token is propagated to the task and all tasks it awaits.
-
 == run: Executor Hopping Within Coroutines
 
 Inside a coroutine, use `run` to execute a child task on a different executor:
@@ -136,6 +121,48 @@ This pattern is useful for:
 * Performing I/O on an I/O-specific context
 * Ensuring UI updates happen on the UI thread
 
+== Stop Token Propagation
+
+Both `run_async` and `run` propagate stop tokens to the launched task and all tasks it awaits. The task accesses its token via `co_await this_coro::stop_token`.
+
+=== Injecting a Token with run_async
+
+Since `run_async` is called from non-coroutine code, there is no caller token to inherit. Pass a stop token explicitly:
+
+[source,cpp]
+----
+std::stop_source source;
+run_async(ex, source.get_token())(cancellable_task());
+
+// Later, to request cancellation:
+source.request_stop();
+----
+
+=== Inheritance with run
+
+`run` is called from within a coroutine, so it inherits the caller's stop token by default:
+
+[source,cpp]
+----
+task<void> parent()
+{
+    // Child automatically receives our stop token
+    co_await run(pool.get_executor())(child_task());
+}
+----
+
+To override with a different token, pass it explicitly:
+
+[source,cpp]
+----
+task<void> parent()
+{
+    std::stop_source local;
+    // Child gets local's token, not our caller's
+    co_await run(pool.get_executor(), local.get_token())(child_task());
+}
+----
+
 == Handler Threading
 
 Handlers passed to `run_async` are invoked on whatever thread the executor schedules:
diff --git a/doc/modules/ROOT/pages/4.coroutines/4c.executors.adoc b/doc/modules/ROOT/pages/4.coroutines/4c.executors.adoc
index 2361ae95a..4839686f8 100644
--- a/doc/modules/ROOT/pages/4.coroutines/4c.executors.adoc
+++ b/doc/modules/ROOT/pages/4.coroutines/4c.executors.adoc
@@ -9,13 +9,13 @@ This section explains executors and execution contexts—the mechanisms that con
 
 == The Executor Concept
 
-An *executor* is an object that can schedule work for execution. In Capy, executors must provide two methods:
+An *executor* is an object that can schedule work for execution. In Capy, executors must provide three methods:
 
 [source,cpp]
 ----
-concept Executor = requires(E ex, std::coroutine_handle<> h) {
-    { ex.dispatch(h) } -> std::same_as<std::coroutine_handle<>>;
-    { ex.post(h) } -> std::same_as<void>;
+concept Executor = requires(E ex, continuation& c) {
+    { ex.dispatch(c) } -> std::same_as<std::coroutine_handle<>>;
+    { ex.post(c) } -> std::same_as<void>;
     { ex.context() } -> std::convertible_to<execution_context&>;
 };
 ----
@@ -24,11 +24,11 @@ concept Executor = requires(E ex, std::coroutine_handle<> h) {
 
 Both methods schedule a coroutine for execution, but with different semantics:
 
-`dispatch(h)`::
-May execute `h` inline if the current thread is already associated with the executor. Returns a coroutine handle—either `h` if execution was deferred, or `std::noop_coroutine()` if `h` was executed immediately. This enables symmetric transfer optimization.
+`dispatch(c)`::
+May execute inline if the current thread is already associated with the executor. Returns a coroutine handle—either `c.h` for inline resumption via symmetric transfer, or `std::noop_coroutine()` if the work was queued. This enables symmetric transfer optimization.
 
-`post(h)`::
-Always queues `h` for later execution. Never executes inline. Returns void. Use when you need guaranteed asynchrony.
+`post(c)`::
+Always queues the continuation for later execution. Never executes inline. Returns void. Use when you need guaranteed asynchrony.
 
 === context()
 
@@ -40,9 +40,9 @@ Returns a reference to the execution context that owns this executor. The contex
 
 [source,cpp]
 ----
-void schedule_work(executor_ref ex, std::coroutine_handle<> h)
+void schedule_work(executor_ref ex, continuation& c)
 {
-    ex.post(h);  // Works with any executor
+    ex.post(c);  // Works with any executor
 }
 
 int main()
diff --git a/doc/modules/ROOT/pages/4.coroutines/4d.io-awaitable.adoc b/doc/modules/ROOT/pages/4.coroutines/4d.io-awaitable.adoc
index 59d022f3a..80b6f2df9 100644
--- a/doc/modules/ROOT/pages/4.coroutines/4d.io-awaitable.adoc
+++ b/doc/modules/ROOT/pages/4.coroutines/4d.io-awaitable.adoc
@@ -166,6 +166,55 @@ The key points:
 2. Use the executor to dispatch completion
 3. Respect the stop token for cancellation
 
+=== Stop Callbacks Must Post, Not Resume
+
+When implementing a stoppable awaitable, you may register a `std::stop_callback` to wake the coroutine when cancellation is requested. The callback fires synchronously on whatever thread calls `request_stop()`, which is typically *not* the executor's thread.
+
+[WARNING]
+====
+*Never resume a coroutine handle directly from a stop_callback.* Doing so executes the coroutine on the wrong thread, corrupting the thread-local frame allocator. This causes use-after-free on the next coroutine allocation—potentially in completely unrelated code.
+====
+
+Post the resume through the executor instead of resuming inline:
+
+[source,cpp]
+----
+struct stoppable_awaitable
+{
+    mutable continuation cont_;
+
+    bool await_ready() { return false; }
+
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> h, io_env const* env)
+    {
+        if (env->stop_token.stop_requested())
+            return h;  // Already cancelled, resume immediately
+
+        // Post through executor when stop is requested
+        cont_.h = h;
+        auto ex = env->executor;
+        stop_cb_.emplace(env->stop_token,
+            [this, ex]() mutable noexcept { ex.post(cont_); });
+
+        start_async_operation();
+        return std::noop_coroutine();
+    }
+
+    void await_resume() { /* ... */ }
+};
+----
+
+The incorrect pattern—which compiles and appears to work but causes memory corruption—looks like this:
+
+[source,cpp]
+----
+// WRONG: resumes coroutine on the calling thread
+stop_cb_.emplace(env->stop_token, h);  // h is a raw coroutine_handle
+----
+
+See xref:4.coroutines/4e.cancellation.adoc#stoppable-awaitables[Implementing Stoppable Awaitables] for a complete example.
+
 == Reference
 
 [cols="1,3"]
diff --git a/doc/modules/ROOT/pages/4.coroutines/4e.cancellation.adoc b/doc/modules/ROOT/pages/4.coroutines/4e.cancellation.adoc
index 7a695dd21..9e8ee15eb 100644
--- a/doc/modules/ROOT/pages/4.coroutines/4e.cancellation.adoc
+++ b/doc/modules/ROOT/pages/4.coroutines/4e.cancellation.adoc
@@ -216,7 +216,7 @@ Inside a task, use `get_stop_token()` to access the current stop token:
 task<> cancellable_work()
 {
     auto token = co_await get_stop_token();
-    
+
     while (!token.stop_requested())
     {
         co_await do_chunk_of_work();
@@ -224,6 +224,17 @@ task<> cancellable_work()
 }
 ----
 
+=== Why Not `coroutine_handle::destroy()`?
+
+`std::coroutine_handle::destroy()` is the {cpp}20 primitive that frees a coroutine frame. It is not a cancellation mechanism, and it has the same flaw as forceful thread interruption: the coroutine is torn down with no opportunity to complete pending I/O, release locks, or run RAII destructors in the expected order.
+
+Capy exposes `task::handle()` and `quitter::handle()` so that Capy's own launchers (`run_async`, `run`) and custom integrations can dispatch coroutines through executors. Calling `destroy()` on such a handle while the coroutine is being awaited by a parent produces undefined behavior: the destruction cascades back through the parent's continuation, re-entering frame destruction that is already in progress.
+
+The rule:
+
+* To cancel work, request a stop on a `std::stop_source` whose token the work observes. The work unwinds cleanly through `final_suspend` and any RAII guards run in the correct order.
+* Do not call `destroy()` on a handle returned by `task::handle()` or `quitter::handle()` while the coroutine is being awaited.
+
 == Part 6: Responding to Cancellation
 
 === Checking the Token
@@ -292,7 +303,69 @@ Capy's I/O operations (provided by Corosio) respect stop tokens at the OS level:
 
 When you request stop, pending I/O operations are cancelled at the OS level, providing immediate response rather than waiting for the operation to complete naturally.
 
-== Part 8: Patterns
+[[stoppable-awaitables]]
+== Part 8: Implementing Stoppable Awaitables
+
+The examples above show *polling* for cancellation with `token.stop_requested()`. For awaitables that suspend indefinitely—waiting for I/O, a lock, or an external event—you need a `std::stop_callback` to wake the coroutine when cancellation arrives.
+
+=== The Dangerous Pattern
+
+A `std::stop_callback` fires synchronously on whatever thread calls `request_stop()`. If the callback resumes the coroutine directly, the coroutine runs on the wrong thread:
+
+[source,cpp]
+----
+// WRONG — causes use-after-free
+std::optional<std::stop_callback<std::coroutine_handle<>>> stop_cb;
+
+std::coroutine_handle<> await_suspend(
+    std::coroutine_handle<> h, io_env const* env)
+{
+    stop_cb.emplace(env->stop_token, h);  // Resumes inline!
+    return std::noop_coroutine();
+}
+----
+
+When an external thread calls `request_stop()`, `h.resume()` executes the coroutine on that thread. The coroutine machinery sets the thread-local frame allocator to the executor's allocator—poisoning the calling thread's TLS. When the executor's pool destructs, the TLS pointer becomes dangling. The next coroutine allocation on that thread dereferences freed memory.
+
+This bug is deterministic, not a race condition. It manifests as a heap-use-after-free in *unrelated* code—wherever the next coroutine frame happens to be allocated on the poisoned thread.
+
+=== The Correct Pattern: Post Through the Executor
+
+Post the coroutine handle through the executor instead of resuming it inline. This ensures the coroutine runs on the correct thread:
+
+[source,cpp]
+----
+struct my_stoppable_awaitable
+{
+    mutable continuation cont_;
+    // ... other members for the async operation ...
+
+    bool await_ready() { return false; }
+
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> h, io_env const* env)
+    {
+        if (env->stop_token.stop_requested())
+            return h;  // Already cancelled
+
+        cont_.h = h;
+        auto ex = env->executor;
+        stop_cb_.emplace(env->stop_token,
+            [this, ex]() mutable noexcept { ex.post(cont_); });
+
+        start_async_operation(h, env);
+        return std::noop_coroutine();
+    }
+
+    void await_resume() { /* check result or throw */ }
+};
+----
+
+When `request_stop()` fires the callback, the coroutine handle is posted to the executor's queue instead of resumed inline. The executor's worker thread picks it up and resumes it in the correct execution context.
+
+NOTE: Capy's built-in I/O awaitables (via Corosio) already use the post-back pattern internally. This guidance applies when writing your own custom awaitables.
+
+== Part 9: Patterns
 
 === Timeout Pattern
 
@@ -316,25 +389,36 @@ task<> with_timeout(task<> operation, std::chrono::seconds timeout)
 
 === User Cancellation
 
-Connect UI cancellation to stop tokens:
+Connect UI cancellation to stop tokens. Pass the token through `run_async` so it propagates automatically via the execution environment—the task accesses it with `co_await this_coro::stop_token` instead of receiving it as a function argument:
 
 [source,cpp]
 ----
 class download_manager
 {
+    executor_ref executor_;
     std::stop_source stop_source_;
-    
+
 public:
     void start_download(std::string url)
     {
-        run_async(executor_)(download(url, stop_source_.get_token()));
+        // Token propagated via io_env, not as a function argument
+        run_async(executor_, stop_source_.get_token())(download(url));
     }
-    
+
     void cancel()
     {
         stop_source_.request_stop();
     }
 };
+
+task<void> download(std::string url)
+{
+    auto token = co_await this_coro::stop_token;  // From run_async's io_env
+    while (!token.stop_requested())
+    {
+        co_await fetch_next_chunk(url);
+    }
+}
 ----
 
 === Graceful Shutdown
diff --git a/doc/modules/ROOT/pages/4.coroutines/4f.composition.adoc b/doc/modules/ROOT/pages/4.coroutines/4f.composition.adoc
index 23135588e..d4750b19b 100644
--- a/doc/modules/ROOT/pages/4.coroutines/4f.composition.adoc
+++ b/doc/modules/ROOT/pages/4.coroutines/4f.composition.adoc
@@ -34,79 +34,95 @@ task<> concurrent()
 
 == when_all: Wait for All Tasks
 
-`when_all` launches multiple tasks concurrently and waits for all of them to complete:
+`when_all` launches multiple `io_task` children concurrently and waits for all of them to complete. It returns `task<io_result<R1, R2, ..., Rn>>`, a single `ec` plus the flattened payloads:
 
 [source,cpp]
 ----
 #include <boost/capy/when_all.hpp>
 
-task<int> fetch_a() { co_return 1; }
-task<int> fetch_b() { co_return 2; }
-task<std::string> fetch_c() { co_return "hello"; }
+io_task<int> fetch_a() { co_return io_result<int>{{}, 1}; }
+io_task<int> fetch_b() { co_return io_result<int>{{}, 2}; }
+io_task<std::string> fetch_c() { co_return io_result<std::string>{{}, "hello"}; }
 
 task<> example()
 {
-    auto [a, b, c] = co_await when_all(fetch_a(), fetch_b(), fetch_c());
-    
+    auto [ec, a, b, c] = co_await when_all(fetch_a(), fetch_b(), fetch_c());
+
+    // ec == std::error_code{} (success)
     // a == 1
     // b == 2
     // c == "hello"
 }
 ----
 
-=== Result Tuple
+=== Result Type
 
-`when_all` returns a tuple of results in the same order as the input tasks. Use structured bindings to unpack them.
+`when_all` returns `io_result<R1, ..., Rn>` where each `Ri` is the child's payload flattened: `io_result<T>` contributes `T`, `io_result<>` contributes `tuple<>`. Check `ec` first; values are only meaningful when `!ec`.
 
-=== Void Filtering
+=== Void io_tasks
 
-Tasks returning `void` do not contribute to the result tuple:
+`io_task<>` children contribute `tuple<>` to the result:
 
 [source,cpp]
 ----
-task<> void_task() { co_return; }
-task<int> int_task() { co_return 42; }
+io_task<> void_task() { co_return io_result<>{}; }
+io_task<int> int_task() { co_return io_result<int>{{}, 42}; }
 
 task<> example()
 {
-    auto [value] = co_await when_all(void_task(), int_task(), void_task());
-    // value == 42  (only the int_task contributes)
+    auto [ec, a, b, c] = co_await when_all(int_task(), void_task(), int_task());
+    // a == 42       (int)
+    // b == tuple<>  (from void io_task)
+    // c == 42       (int)
 }
 ----
 
-If all tasks return `void`, `when_all` returns `void`:
+When all children are `io_task<>`, just check `r.ec`:
 
 [source,cpp]
 ----
 task<> example()
 {
-    co_await when_all(void_task_a(), void_task_b());  // Returns void
+    auto r = co_await when_all(void_task_a(), void_task_b());
+    if (r.ec)
+        // handle error
 }
 ----
 
 === Error Handling
 
-If any task throws an exception:
+I/O errors are reported through the `ec` field of the `io_result`. When any child returns a non-zero `ec`:
 
-1. The exception is captured
-2. Stop is requested for sibling tasks
-3. All tasks are allowed to complete (or respond to stop)
-4. The *first* exception is rethrown; later exceptions are discarded
+1. Stop is requested for sibling tasks
+2. All tasks complete (or respond to stop)
+3. The first `ec` is propagated in the outer `io_result`
 
 [source,cpp]
 ----
-task<int> might_fail(bool fail)
+task<> example()
+{
+    auto [ec, a, b] = co_await when_all(task_a(), task_b());
+    if (ec)
+        std::cerr << "Error: " << ec.message() << "\n";
+}
+----
+
+If a task throws an exception, it is captured and rethrown after all tasks complete. Exceptions take priority over `ec`.
+
+[source,cpp]
+----
+io_task<int> might_throw(bool fail)
 {
     if (fail)
         throw std::runtime_error("failed");
-    co_return 42;
+    co_return io_result<int>{{}, 42};
 }
 
 task<> example()
 {
     try
     {
-        co_await when_all(might_fail(true), might_fail(false));
+        co_await when_all(might_throw(true), might_throw(false));
     }
     catch (std::runtime_error const& e)
     {
@@ -121,23 +137,24 @@ When one task fails, `when_all` requests stop for its siblings. Well-behaved tas
 
 [source,cpp]
 ----
-task<> long_running()
+io_task<> long_running()
 {
-    auto token = co_await get_stop_token();
-    
+    auto token = co_await this_coro::stop_token;
+
     for (int i = 0; i < 1000; ++i)
     {
         if (token.stop_requested())
-            co_return;  // Exit early when sibling fails
-        
+            co_return io_result<>{};  // Exit early when sibling fails
+
         co_await do_iteration();
     }
+    co_return io_result<>{};
 }
 ----
 
-== when_any: First-to-Finish Wins
+== when_any: First-to-Succeed Wins
 
-`when_any` launches multiple tasks concurrently and returns when the *first* one completes:
+`when_any` launches multiple `io_task` children concurrently and returns when the first one *succeeds* (`!ec`):
 
 [source,cpp]
 ----
@@ -145,18 +162,20 @@ task<> long_running()
 
 task<> example()
 {
-    auto [index, result] = co_await when_any(
-        fetch_int(),     // task<int>
-        fetch_string()   // task<std::string>
+    auto result = co_await when_any(
+        fetch_int(),     // io_task<int>
+        fetch_string()   // io_task<std::string>
     );
-    // index indicates which task won (0 or 1)
-    // result is std::variant<int, std::string>
+    // result is std::variant<std::error_code, int, std::string>
+    // index 0: all tasks failed (error_code)
+    // index 1: fetch_int won
+    // index 2: fetch_string won
 }
 ----
 
-The result is a pair containing the winner's index and a deduplicated variant of possible result types. When a winner is determined, stop is requested for all siblings. All tasks complete before `when_any` returns.
+The result is a `variant` with `error_code` at index 0 (failure/no winner) and one alternative per input task at indices 1..N. Only tasks returning `!ec` can win; errors and exceptions do not count as winning. When a winner is found, stop is requested for all siblings. All tasks complete before `when_any` returns.
 
-For detailed coverage including error handling, cancellation, and the vector overload, see Racing Tasks.
+For detailed coverage including error handling, cancellation, and the range overload, see Racing Tasks.
 
 == Practical Patterns
 
@@ -166,66 +185,68 @@ Fetch multiple resources simultaneously:
 
 [source,cpp]
 ----
-task<page_data> fetch_page_data(std::string url)
+io_task<page_data> fetch_page_data(std::string url)
 {
-    auto [header, body, sidebar] = co_await when_all(
+    auto [ec, header, body, sidebar] = co_await when_all(
         fetch_header(url),
         fetch_body(url),
         fetch_sidebar(url)
     );
-    
-    co_return page_data{
+    if (ec)
+        co_return io_result<page_data>{ec, {}};
+
+    co_return io_result<page_data>{{}, {
         std::move(header),
         std::move(body),
         std::move(sidebar)
-    };
+    }};
 }
 ----
 
 === Fan-Out/Fan-In
 
-Process items in parallel, then combine results:
+Process items in parallel, then combine results using the range overload:
 
 [source,cpp]
 ----
-task<int> process_item(item const& i);
+io_task<int> process_item(item const& i);
 
 task<int> process_all(std::vector<item> const& items)
 {
-    std::vector<task<int>> tasks;
+    std::vector<io_task<int>> tasks;
     for (auto const& item : items)
         tasks.push_back(process_item(item));
-    
-    // This requires a range-based when_all (not yet available)
-    // For now, use fixed-arity when_all
-    
+
+    auto [ec, results] = co_await when_all(std::move(tasks));
+    if (ec)
+        co_return 0;
+
     int total = 0;
-    // ... accumulate results
+    for (auto v : results)
+        total += v;
     co_return total;
 }
 ----
 
-=== Timeout with Fallback
+=== Timeout
 
-Use `when_any` to implement timeout with fallback:
+The `timeout` combinator races an awaitable against a deadline:
 
 [source,cpp]
 ----
-task<Response> fetch_with_timeout(Request req)
-{
-    auto [index, result] = co_await when_any(
-        fetch_data(req),
-        timeout_after<Response>(100ms)
-    );
-
-    if (index == 1)
-        throw timeout_error{"Request timed out"};
+#include <boost/capy/timeout.hpp>
 
-    co_return std::get<Response>(result);
+task<> example()
+{
+    auto [ec, n] = co_await timeout(sock.read_some(buf), 50ms);
+    if (ec == cond::timeout)
+    {
+        // deadline expired before read completed
+    }
 }
 ----
 
-The `timeout_after` helper waits for the specified duration then throws. If `fetch_data` completes first, its result is returned. If the timer wins, the timeout exception propagates.
+`timeout` returns the same `io_result` type as the inner awaitable. On timeout, `ec` is set to `error::timeout` and payload values are default-initialized. Unlike `when_any`, exceptions from the inner awaitable are always propagated and never swallowed by the timer.
 
 == Implementation Notes
 
@@ -259,6 +280,9 @@ This design ensures proper context propagation to all children.
 
 | `<boost/capy/when_any.hpp>`
 | First-completion racing with when_any
+
+| `<boost/capy/timeout.hpp>`
+| Race an awaitable against a deadline
 |===
 
 You have now learned how to compose tasks concurrently with `when_all` and `when_any`. In the next section, you will learn about frame allocators for customizing coroutine memory allocation.
diff --git a/doc/modules/ROOT/pages/4.coroutines/4g.allocators.adoc b/doc/modules/ROOT/pages/4.coroutines/4g.allocators.adoc
index a3e5fa104..b0cdeadac 100644
--- a/doc/modules/ROOT/pages/4.coroutines/4g.allocators.adoc
+++ b/doc/modules/ROOT/pages/4.coroutines/4g.allocators.adoc
@@ -41,6 +41,20 @@ The "window" is the interval between setting the thread-local allocator and the
 
 After the window closes (at the first suspension), the TLS allocator may be restored to a previous value. The task retains its captured allocator regardless.
 
+== TLS Preservation
+
+Between a coroutine's `await_resume` (which sets TLS to the correct allocator) and the next child coroutine invocation (whose `operator new` reads TLS), arbitrary user code runs. If that code resumes a coroutine from a different chain on the same thread -- by calling `.resume()` directly, pumping a completion queue, or running nested dispatch -- the other coroutine's `await_resume` overwrites TLS with its own allocator. The original coroutine's next child would then allocate from the wrong resource.
+
+To prevent this, any code that calls `.resume()` on a coroutine handle must use `safe_resume` from `<boost/capy/ex/frame_allocator.hpp>`:
+
+[source,cpp]
+----
+// In your event loop or dispatch path:
+capy::safe_resume(h);   // saves and restores TLS around h.resume()
+----
+
+`safe_resume` saves the current thread-local allocator, calls `h.resume()`, then restores the saved value. This makes TLS behave like a stack: nested resumes cannot spoil the outer value. All of Capy's built-in executors (`thread_pool`, strands, `blocking_context`) use `safe_resume` internally. Custom executor event loops must do the same -- see xref:8.examples/8n.custom-executor.adoc[Custom Executor] for an example.
+
 == The FrameAllocator Concept
 
 Custom allocators must satisfy the `FrameAllocator` concept, which is compatible with {cpp} allocator requirements:
@@ -162,6 +176,84 @@ void process_batch(std::vector<item> const& items)
 }
 ----
 
+=== Scope Variables to Reduce Frame Size
+
+Compilers use declaration scope (braces) to decide which variables cross suspend points and must live in the coroutine frame. Variables declared in an outer scope remain in the frame even after their last use, as long as a `co_await` follows within the same scope.
+
+Wrapping buffer usage in explicit braces can dramatically reduce frame size:
+
+[source,cpp]
+----
+// BAD: buf lives in frame across all subsequent co_awaits
+task<> process(stream& s)
+{
+    char buf[4096];
+    auto [ec, n] = co_await s.read_some(buf);
+    co_await do_work(buf, n);
+    co_await s.write_some(reply);   // buf wastes 4K in frame
+}
+
+// GOOD: braces end buf's lifetime before next suspend
+task<> process(stream& s)
+{
+    std::size_t n;
+    {
+        char buf[4096];
+        auto [ec, n_] = co_await s.read_some(buf);
+        n = n_;
+        co_await do_work(buf, n);
+    }
+    co_await s.write_some(reply);  // 4K saved
+}
+----
+
+This technique also enables the compiler to *overlap* variables in the frame. When two variables have completely non-overlapping lifetimes (in separate scoped blocks), the compiler can reuse the same frame memory for both — even on Clang:
+
+[source,cpp]
+----
+// BAD: both arrays in frame simultaneously (8K)
+task<> pipeline(stream& in, stream& out)
+{
+    char read_buf[4096];
+    auto [ec1, n] = co_await in.read_some(read_buf);
+
+    char write_buf[4096];
+    prepare(write_buf, read_buf, n);
+    co_await out.write_some(write_buf);
+}
+
+// GOOD: non-overlapping scopes allow frame reuse (4K)
+task<> pipeline(stream& in, stream& out)
+{
+    std::size_t n;
+    {
+        char read_buf[4096];
+        auto [ec, n_] = co_await in.read_some(read_buf);
+        n = n_;
+    }
+    {
+        char write_buf[4096];
+        prepare(write_buf, n);
+        co_await out.write_some(write_buf);
+    }
+}
+----
+
+In the second version, `read_buf` and `write_buf` never coexist, so the compiler can place them at the same frame offset — halving the frame's buffer footprint. This optimization applies to any variables with non-overlapping lifetimes, not just arrays.
+
+=== GCC vs Clang Frame Sizes
+
+NOTE: This section draws on https://chuanqixu9.github.io/c++/2026/03/27/C++20-Coroutines-from-compiler-and-library-authors-perspective.en.html[C++20 Coroutines from compiler and library authors' perspective] by Chuanqi Xu.
+
+GCC and Clang use fundamentally different strategies for coroutine frame layout:
+
+* **Clang** performs frame layout after middle-end optimizations. Dead variables, unused temporaries, and constant-folded intermediates are eliminated before the frame is sized.
+* **GCC** performs frame layout in the frontend, before optimizations. Every local variable whose scope spans a suspend point ends up in the frame, even if optimizations would later prove it dead.
+
+The practical consequence is that GCC coroutine frames are often 5-10x larger than Clang's for the same source code. In one benchmark, the same coroutine produced a 24-byte frame on Clang and a 16,032-byte frame on GCC.
+
+For production coroutine workloads, Clang currently produces substantially better code. If you must use GCC, pay extra attention to variable scoping (above) and consider supplying a custom `memory_resource` with larger block sizes, since frames above 2048 bytes bypass the default recycling allocator's pooling.
+
 === Profile Before Optimizing
 
 Coroutine frame allocation is rarely the bottleneck. Profile your application before investing in custom allocators.
@@ -175,8 +267,11 @@ Coroutine frame allocation is rarely the bottleneck. Profile your application be
 | `<boost/capy/ex/frame_allocator.hpp>`
 | Frame allocator concept and utilities
 
+| `<boost/capy/ex/frame_alloc_promise.hpp>`
+| Mixin base for promise types that use the TLS frame allocator
+
 | `<boost/capy/ex/recycling_memory_resource.hpp>`
 | Default recycling allocator implementation
 |===
 
-You have now learned how coroutine frame allocation works and how to customize it. This completes the Coroutines in Capy section. Continue to xref:../5.buffers/5a.overview.adoc[Buffer Sequences] to learn about Capy's buffer model.
+You have now learned how coroutine frame allocation works and how to customize it. Continue to xref:4.coroutines/4h.lambda-captures.adoc[Lambda Coroutine Captures] to learn about a critical pitfall with lambda coroutines.
diff --git a/doc/modules/ROOT/pages/4.coroutines/4h.lambda-captures.adoc b/doc/modules/ROOT/pages/4.coroutines/4h.lambda-captures.adoc
new file mode 100644
index 000000000..bbe82a95e
--- /dev/null
+++ b/doc/modules/ROOT/pages/4.coroutines/4h.lambda-captures.adoc
@@ -0,0 +1,209 @@
+= Lambda Coroutine Captures
+
+Lambda captures are a common source of undefined behavior in coroutine code. This section explains the problem and the safe patterns to use instead.
+
+== Prerequisites
+
+* Completed xref:4.coroutines/4g.allocators.adoc[Frame Allocators]
+* Understanding of coroutine frame lifetime from xref:../2.cpp20-coroutines/2c.machinery.adoc[Part III: Coroutine Machinery]
+
+== The Problem
+
+Consider this innocent-looking code:
+
+[source,cpp]
+----
+namespace capy = boost::capy;
+
+void process(socket& sock)
+{
+    auto task = [&sock]() -> capy::task<>
+    {
+        char buf[1024];
+        auto [ec, n] = co_await sock.read_some(buffer(buf, sizeof(buf)));
+    }();
+
+    run_async(executor)(std::move(task));
+}
+----
+
+**This code has undefined behavior.** It may crash, corrupt memory, or appear to work until it doesn't.
+
+== Why It Fails
+
+In {cpp}20, lambda coroutine captures are stored in the lambda closure object, *not* in the coroutine frame. Here is what happens:
+
+1. The lambda closure is created, capturing `sock` by reference
+2. The lambda's `operator()()` is called
+3. A coroutine frame is allocated on the heap
+4. The coroutine suspends at `initial_suspend`
+5. `operator()()` returns the task
+6. **The lambda closure is destroyed** — it was a temporary
+7. Later, the coroutine resumes
+8. The coroutine accesses `sock` through the destroyed closure
+9. **Undefined behavior**
+
+The coroutine frame does not contain a copy of the captured `sock`. It contains a reference to the lambda's capture storage, which no longer exists.
+
+== The Safe Pattern: IIFE With Parameters
+
+The solution is to pass values as **function parameters** instead of **lambda captures**. Function parameters are copied to the coroutine frame.
+
+[source,cpp]
+----
+namespace capy = boost::capy;
+
+void process(socket& sock)
+{
+    auto task = [](socket* s) -> capy::task<>
+    {
+        char buf[1024];
+        auto [ec, n] = co_await s->read_some(buffer(buf, sizeof(buf)));
+    }(&sock);
+
+    run_async(executor)(std::move(task));
+}
+----
+
+This is an **Immediately Invoked Function Expression (IIFE)**. The parameter `s` is copied to the coroutine frame before the first suspension, so it remains valid for the coroutine's lifetime.
+
+== Complete Example
+
+=== Broken: Using Captures
+
+[source,cpp]
+----
+class connection_handler
+{
+    socket sock_;
+    std::string name_;
+
+public:
+    capy::task<> run()
+    {
+        // BROKEN: 'this' captured in lambda, lambda destroyed after invoke
+        return [this]() -> capy::task<>
+        {
+            log("Connection from", name_);  // UB: 'this' is dangling
+            co_await handle_request();
+        }();
+    }
+};
+----
+
+=== Correct: Using Parameters
+
+[source,cpp]
+----
+class connection_handler
+{
+    socket sock_;
+    std::string name_;
+
+public:
+    capy::task<> run()
+    {
+        // CORRECT: 'self' is a parameter, copied to coroutine frame
+        return [](connection_handler* self) -> capy::task<>
+        {
+            log("Connection from", self->name_);
+            co_await self->handle_request();
+        }(this);
+    }
+};
+----
+
+== When Are Captures Safe?
+
+Captures are only safe when the lambda object **outlives the coroutine**:
+
+[source,cpp]
+----
+// SAFE: lambda stored in 'handler', outlives coroutine
+auto handler = [&sock]() -> capy::task<>
+{
+    co_await sock.read_some(...);
+};
+
+// Lambda 'handler' still exists here
+run_and_wait(handler());  // Blocks until coroutine completes
+// Lambda destroyed after coroutine finishes
+----
+
+This pattern is rare. Most async code immediately invokes the lambda and discards it, making captures unsafe.
+
+== Rules of Thumb
+
+1. **Default to IIFE with parameters** for lambda coroutines
+2. **Never capture by reference** (`[&]`) in a lambda coroutine unless the lambda outlives the coroutine
+3. **Capturing by value** (`[=]`, `[x]`) is equally broken — the copy lives in the lambda, not the coroutine frame
+4. **Capturing `this`** is particularly dangerous and common
+5. **When in doubt, use parameters**
+
+== Alternative: Named Coroutine Functions
+
+If the IIFE syntax feels awkward, use a named function instead:
+
+[source,cpp]
+----
+class connection_handler
+{
+    socket sock_;
+
+    capy::task<> do_handle()
+    {
+        // 'this' is an implicit parameter, handled correctly
+        co_await sock_.read_some(...);
+    }
+
+public:
+    capy::task<> run()
+    {
+        return do_handle();
+    }
+};
+----
+
+Member function coroutines work correctly because `this` is an implicit parameter, not a capture. The compiler copies it to the coroutine frame.
+
+== Quick Reference
+
+[cols="2,1,2"]
+|===
+| Pattern | Safety | Notes
+
+| `[x]() -> task<> { use(x); }()`
+| UNSAFE
+| Capture `x` destroyed with lambda
+
+| `[](auto x) -> task<> { use(x); }(val)`
+| SAFE
+| Parameter `x` in coroutine frame
+
+| `[&x]() -> task<> { use(x); }()`
+| UNSAFE
+| Accessed through dangling `this` pointer to destroyed closure
+
+| `[](auto& x) -> task<> { use(x); }(val)`
+| SAFE*
+| Reference parameter; `val` must outlive coroutine
+
+| Member function coroutine
+| SAFE
+| `this` is an implicit parameter
+|===
+
+== Why Does {cpp} Work This Way?
+
+The {cpp} standard specifies that coroutine parameters are copied to the coroutine state, but lambda captures are not. This is because:
+
+* Lambda captures are part of the lambda's closure type
+* The coroutine is the lambda's `operator()`
+* The coroutine frame only stores what is needed for the function body
+* The closure is external to the function body
+
+There have been proposals to change this behavior, but as of {cpp}23 the issue remains.
+
+== Next Steps
+
+You have now learned the major pitfalls of lambda coroutines. This completes the Coroutines in Capy section. Continue to xref:../5.buffers/5a.overview.adoc[Buffer Sequences] to learn about Capy's buffer model.
diff --git a/doc/modules/ROOT/pages/5.buffers/5a.overview.adoc b/doc/modules/ROOT/pages/5.buffers/5a.overview.adoc
index 921de559f..fdf0dfc35 100644
--- a/doc/modules/ROOT/pages/5.buffers/5a.overview.adoc
+++ b/doc/modules/ROOT/pages/5.buffers/5a.overview.adoc
@@ -99,23 +99,6 @@ This single signature accepts:
 * A custom composite type
 * *Any composition of the above—without allocation*
 
-== Zero-Allocation Composition
-
-With concepts, composition creates views, not copies:
-
-[source,cpp]
-----
-HeaderBuffers headers = /* ... */;
-BodyBuffers body = /* ... */;
-
-// cat() creates a view that iterates both sequences
-auto combined = cat(headers, body);  // No allocation!
-
-write_data(combined);  // Works because combined satisfies ConstBufferSequence
-----
-
-The `cat` function returns a lightweight object that, when iterated, first yields buffers from `headers`, then from `body`. The buffers themselves are not copied—only iterators are composed.
-
 == STL Parallel
 
 This design follows Stepanov's insight from the STL: algorithms parameterized on concepts (iterators), not concrete types (containers), enable composition that concrete types forbid.
diff --git a/doc/modules/ROOT/pages/5.buffers/5c.sequences.adoc b/doc/modules/ROOT/pages/5.buffers/5c.sequences.adoc
index 9cec279f5..e4101711b 100644
--- a/doc/modules/ROOT/pages/5.buffers/5c.sequences.adoc
+++ b/doc/modules/ROOT/pages/5.buffers/5c.sequences.adoc
@@ -100,62 +100,47 @@ void process(Buffers const& bufs)
 
 These functions handle both single buffers (returning pointer-to-self) and ranges (returning standard iterators).
 
-== consuming_buffers
+== buffer_slice
 
-When transferring data incrementally, `consuming_buffers` tracks progress:
+When transferring data incrementally, `buffer_slice` returns a slice that tracks progress:
 
 [source,cpp]
 ----
-#include <boost/capy/buffers/consuming_buffers.hpp>
+#include <boost/capy/buffers/buffer_slice.hpp>
 
 template<MutableBufferSequence Buffers>
 task<std::size_t> read_all(Stream& stream, Buffers buffers)
 {
-    consuming_buffers<Buffers> remaining(buffers);
+    auto remaining = buffer_slice(buffers);
+    std::size_t const total_size = buffer_size(buffers);
     std::size_t total = 0;
-    
-    while (buffer_size(remaining) > 0)
+
+    while (total < total_size)
     {
-        auto [ec, n] = co_await stream.read_some(remaining);
-        if (ec.failed())
-            break;
-        remaining.consume(n);
+        auto [ec, n] = co_await stream.read_some(remaining.data());
+        remaining.remove_prefix(n);
         total += n;
+        if (ec)
+            break;
     }
-    
+
     co_return total;
 }
 ----
 
-`consuming_buffers` wraps a buffer sequence and provides:
+`buffer_slice(seq, offset, length)` returns an object of unspecified type that satisfies the `Slice` concept, providing:
 
-* `consume(n)` — Mark `n` bytes as consumed (remove from front)
-* Iteration over unconsumed buffers
-* `buffer_size()` of remaining bytes
+* `data()` — Buffer sequence view of the slice's current bytes (pass to `read_some`/`write_some`)
+* `remove_prefix(n)` — Advance the start by `n` bytes
 
-== Zero-Allocation Composition
-
-The `cat()` function composes buffer sequences without allocation:
-
-[source,cpp]
-----
-auto headers = std::array{header_buf1, header_buf2};
-auto body = body_buffer;
-
-auto combined = cat(headers, body);  // No allocation
-
-// combined satisfies ConstBufferSequence
-// Iteration yields: header_buf1, header_buf2, body_buffer
-----
-
-The returned object stores references (or small copies for single buffers) and iterates through the composed sequence on demand.
+The `offset` and `length` parameters (both optional) make `buffer_slice` a general byte sub-range primitive, not just an iteration-state holder.
 
 == Why Bidirectional?
 
 The concepts require bidirectional ranges (not just forward ranges) for two reasons:
 
 1. Some algorithms traverse buffers backwards
-2. `consuming_buffers` needs to adjust the first buffer's start position
+2. The buffer sequence view returned by `Slice::data()` needs to adjust the first and last buffers' bounds
 
 If your custom buffer sequence only provides forward iteration, wrap it in a type that provides bidirectional access.
 
@@ -168,8 +153,11 @@ If your custom buffer sequence only provides forward iteration, wrap it in a typ
 | `<boost/capy/buffers.hpp>`
 | Concepts and iteration functions
 
-| `<boost/capy/buffers/consuming_buffers.hpp>`
-| Incremental consumption wrapper
+| `<boost/capy/buffers/buffer_slice.hpp>`
+| Byte sub-range slicing algorithm
+
+| `<boost/capy/concept/slice.hpp>`
+| `Slice` concept
 |===
 
 You have now learned how buffer sequences enable zero-allocation composition. Continue to xref:5.buffers/5d.system-io.adoc[System I/O Integration] to see how buffer sequences interface with operating system I/O.
diff --git a/doc/modules/ROOT/pages/5.buffers/5e.algorithms.adoc b/doc/modules/ROOT/pages/5.buffers/5e.algorithms.adoc
index f5c356113..ca169e594 100644
--- a/doc/modules/ROOT/pages/5.buffers/5e.algorithms.adoc
+++ b/doc/modules/ROOT/pages/5.buffers/5e.algorithms.adoc
@@ -151,19 +151,19 @@ The algorithm fills target buffers sequentially, reading from source buffers as
 template<ReadStream Stream, MutableBufferSequence Buffers>
 task<std::size_t> read_full(Stream& stream, Buffers buffers)
 {
-    consuming_buffers<Buffers> remaining(buffers);
+    auto remaining = buffer_slice(buffers);
+    std::size_t const total_size = buffer_size(buffers);
     std::size_t total = 0;
-    
-    while (buffer_size(remaining) > 0)
+
+    while (total < total_size)
     {
-        auto [ec, n] = co_await stream.read_some(remaining);
-        if (ec.failed())
-            co_return total;  // Return partial read on error
-        
-        remaining.consume(n);
+        auto [ec, n] = co_await stream.read_some(remaining.data());
+        remaining.remove_prefix(n);
         total += n;
+        if (ec)
+            co_return total;
     }
-    
+
     co_return total;
 }
 ----
@@ -175,19 +175,19 @@ task<std::size_t> read_full(Stream& stream, Buffers buffers)
 template<WriteStream Stream, ConstBufferSequence Buffers>
 task<std::size_t> write_full(Stream& stream, Buffers buffers)
 {
-    consuming_buffers<Buffers> remaining(buffers);
+    auto remaining = buffer_slice(buffers);
+    std::size_t const total_size = buffer_size(buffers);
     std::size_t total = 0;
-    
-    while (buffer_size(remaining) > 0)
+
+    while (total < total_size)
     {
-        auto [ec, n] = co_await stream.write_some(remaining);
-        if (ec.failed())
-            co_return total;
-        
-        remaining.consume(n);
+        auto [ec, n] = co_await stream.write_some(remaining.data());
+        remaining.remove_prefix(n);
         total += n;
+        if (ec)
+            co_return total;
     }
-    
+
     co_return total;
 }
 ----
@@ -204,7 +204,8 @@ std::string header = build_header();
 std::vector<char> body = load_body();
 
 // No copying—header and body are written directly
-co_await write(stream, cat(make_buffer(header), make_buffer(body)));
+std::array buffers = {make_buffer(header), make_buffer(body)};
+co_await write(stream, buffers);
 ----
 
 === Scatter/Gather Operations
diff --git a/doc/modules/ROOT/pages/5.buffers/5f.dynamic.adoc b/doc/modules/ROOT/pages/5.buffers/5f.dynamic.adoc
index 3c0f8ca70..56e78ffa2 100644
--- a/doc/modules/ROOT/pages/5.buffers/5f.dynamic.adoc
+++ b/doc/modules/ROOT/pages/5.buffers/5f.dynamic.adoc
@@ -76,9 +76,8 @@ task<> read_into_buffer(Stream& stream, DynamicBuffer auto& buffer)
     
     // Read into prepared space
     auto [ec, n] = co_await stream.read_some(space);
-    
-    if (!ec.failed())
-        buffer.commit(n);  // Make data readable
+
+    buffer.commit(n);  // Make data readable
 }
 ----
 
@@ -240,9 +239,9 @@ task<std::string> read_line(Stream& stream)
         // Prepare space and read
         auto space = buffer.prepare(256);
         auto [ec, n] = co_await stream.read_some(space);
-        if (ec.failed())
-            throw std::system_error(ec);
         buffer.commit(n);
+        if (ec)
+            throw std::system_error(ec);
         
         // Search for newline in readable data
         auto data = buffer.data();
diff --git a/doc/modules/ROOT/pages/6.streams/6a.overview.adoc b/doc/modules/ROOT/pages/6.streams/6a.overview.adoc
index fd14790d2..479409463 100644
--- a/doc/modules/ROOT/pages/6.streams/6a.overview.adoc
+++ b/doc/modules/ROOT/pages/6.streams/6a.overview.adoc
@@ -167,10 +167,12 @@ task<> echo(any_stream& stream)
     for (;;)
     {
         auto [ec, n] = co_await stream.read_some(mutable_buffer(buf));
+
+        auto [wec, wn] = co_await write(stream, const_buffer(buf, n));
+
         if (ec)
             co_return;
-        
-        auto [wec, wn] = co_await write(stream, const_buffer(buf, n));
+
         if (wec)
             co_return;
     }
diff --git a/doc/modules/ROOT/pages/6.streams/6b.streams.adoc b/doc/modules/ROOT/pages/6.streams/6b.streams.adoc
index 5e6bc2ae6..44f42a0eb 100644
--- a/doc/modules/ROOT/pages/6.streams/6b.streams.adoc
+++ b/doc/modules/ROOT/pages/6.streams/6b.streams.adoc
@@ -28,13 +28,18 @@ template<MutableBufferSequence Buffers>
 IoAwaitable auto read_some(Buffers buffers);
 ----
 
-Returns an awaitable yielding `(error_code, std::size_t)`:
+Attempts to read up to `buffer_size(buffers)` bytes from the stream into the buffer sequence. Await-returns `(error_code, std::size_t)`:
 
-* On success: `!ec`, and `n >= 1` bytes were read
-* On error: `ec`, and `n == 0`
-* On EOF: `ec == cond::eof`, and `n == 0`
+If `buffer_size(buffers) > 0`:
 
-If `buffer_empty(buffers)` is true, completes immediately with `n == 0` and no error.
+* If `!ec`, then `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were read into the buffer sequence.
+* If `ec`, then `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes read before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
+
+I/O conditions from the underlying system are reported via `ec`. Failures in the library itself (such as allocation failure) are reported via exceptions.
+
+*Throws:* `std::bad_alloc` if coroutine frame allocation fails.
 
 === Partial Transfer
 
@@ -45,7 +50,7 @@ If `buffer_empty(buffers)` is true, completes immediately with `n == 0` and no e
 char buf[1024];
 auto [ec, n] = co_await stream.read_some(mutable_buffer(buf));
 // n might be 1, might be 500, might be 1024
-// The only guarantee: if !ec && n > 0
+// if !ec, then n >= 1
 ----
 
 This matches underlying OS behavior—reads return when *some* data is available.
@@ -58,18 +63,15 @@ template<ReadStream Stream>
 task<> dump_stream(Stream& stream)
 {
     char buf[256];
-    
+
     for (;;)
     {
         auto [ec, n] = co_await stream.read_some(mutable_buffer(buf));
-        
-        if (ec == cond::eof)
-            break;  // End of stream
-        
-        if (ec)
-            throw std::system_error(ec);
-        
+
         std::cout.write(buf, n);
+
+        if (ec)
+            break;
     }
 }
 ----
@@ -95,12 +97,18 @@ template<ConstBufferSequence Buffers>
 IoAwaitable auto write_some(Buffers buffers);
 ----
 
-Returns an awaitable yielding `(error_code, std::size_t)`:
+Attempts to write up to `buffer_size(buffers)` bytes from the buffer sequence to the stream. Await-returns `(error_code, std::size_t)`:
 
-* On success: `!ec`, and `n >= 1` bytes were written
-* On error: `ec`, and `n == 0`
+If `buffer_size(buffers) > 0`:
 
-If `buffer_empty(buffers)` is true, completes immediately with `n == 0` and no error.
+* If `!ec`, then `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were written from the buffer sequence.
+* If `ec`, then `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes written before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
+
+I/O conditions from the underlying system are reported via `ec`. Failures in the library itself (such as allocation failure) are reported via exceptions.
+
+*Throws:* `std::bad_alloc` if coroutine frame allocation fails.
 
 === Partial Transfer
 
@@ -191,20 +199,15 @@ task<> handle_connection(any_stream& stream)
     
     for (;;)
     {
-        // Read some data
         auto [ec, n] = co_await stream.read_some(mutable_buffer(buf));
-        
-        if (ec == cond::eof)
-            co_return;  // Client closed connection
-        
-        if (ec)
-            throw std::system_error(ec);
-        
-        // Echo it back
+
         auto [wec, wn] = co_await write(stream, const_buffer(buf, n));
-        
+
+        if (ec)
+            break;
+
         if (wec)
-            throw std::system_error(wec);
+            break;
     }
 }
 ----
diff --git a/doc/modules/ROOT/pages/6.streams/6c.sources-sinks.adoc b/doc/modules/ROOT/pages/6.streams/6c.sources-sinks.adoc
index 73b0627ce..4ea54bfa5 100644
--- a/doc/modules/ROOT/pages/6.streams/6c.sources-sinks.adoc
+++ b/doc/modules/ROOT/pages/6.streams/6c.sources-sinks.adoc
@@ -29,7 +29,7 @@ template<MutableBufferSequence Buffers>
 IoAwaitable auto read(Buffers buffers);
 ----
 
-Returns an awaitable yielding `(error_code, std::size_t)`:
+Await-returns `(error_code, std::size_t)`:
 
 * On success: `!ec`, and `n == buffer_size(buffers)` (buffer completely filled)
 * On EOF: `ec == cond::eof`, and `n` is bytes read before EOF (partial read)
diff --git a/doc/modules/ROOT/pages/6.streams/6d.buffer-concepts.adoc b/doc/modules/ROOT/pages/6.streams/6d.buffer-concepts.adoc
index 7941f7369..3f4af412c 100644
--- a/doc/modules/ROOT/pages/6.streams/6d.buffer-concepts.adoc
+++ b/doc/modules/ROOT/pages/6.streams/6d.buffer-concepts.adoc
@@ -52,11 +52,11 @@ concept BufferSource =
 IoAwaitable auto pull(const_buffer* arr, std::size_t max_count);
 ----
 
-Returns an awaitable yielding `(error_code, std::size_t)`:
+Await-returns `(error_code, std::size_t)`:
 
-* On success: `!ec.failed()`, fills `arr[0..count-1]` with buffer descriptors
+* On success: `!ec`, fills `arr[0..count-1]` with buffer descriptors
 * On exhausted: `count == 0` indicates no more data
-* On error: `ec.failed()`
+* On error: `ec`
 
 The buffers point into the source's internal storage. You must consume all returned data before calling `pull()` again—the previous buffers become invalid.
 
@@ -73,7 +73,7 @@ task<> process_source(Source& source)
     {
         auto [ec, count] = co_await source.pull(bufs, 8);
         
-        if (ec.failed())
+        if (ec)
             throw std::system_error(ec);
         
         if (count == 0)
@@ -228,7 +228,7 @@ task<> decompress_stream(any_buffer_source& compressed, any_write_sink& output)
     for (;;)
     {
         auto [ec, count] = co_await compressed.pull(bufs, 8);
-        if (ec.failed())
+        if (ec)
             throw std::system_error(ec);
         if (count == 0)
             break;
diff --git a/doc/modules/ROOT/pages/6.streams/6e.algorithms.adoc b/doc/modules/ROOT/pages/6.streams/6e.algorithms.adoc
index e8340e7e5..7692211fa 100644
--- a/doc/modules/ROOT/pages/6.streams/6e.algorithms.adoc
+++ b/doc/modules/ROOT/pages/6.streams/6e.algorithms.adoc
@@ -224,7 +224,7 @@ if (ec == cond::eof)
     // Normal completion
     std::cout << "Transferred " << total << " bytes\n";
 }
-else if (ec.failed())
+else if (ec)
 {
     // Error occurred
     std::cerr << "Error after " << total << " bytes: " << ec.message() << "\n";
diff --git a/doc/modules/ROOT/pages/6.streams/6f.isolation.adoc b/doc/modules/ROOT/pages/6.streams/6f.isolation.adoc
index a519334b6..ff2218db2 100644
--- a/doc/modules/ROOT/pages/6.streams/6f.isolation.adoc
+++ b/doc/modules/ROOT/pages/6.streams/6f.isolation.adoc
@@ -38,7 +38,7 @@ task<> handle_protocol(any_stream& stream)
     for (;;)
     {
         auto [ec, n] = co_await stream.read_some(mutable_buffer(buf));
-        if (ec.failed())
+        if (ec)
             co_return;
         
         // Process and respond...
@@ -237,4 +237,4 @@ Type-erased wrappers are in `<boost/capy/io/>`:
 * `any_read_source`, `any_write_sink`
 * `any_buffer_source`, `any_buffer_sink`
 
-You have now completed the Stream Concepts section. These abstractions—streams, sources, sinks, and their type-erased wrappers—form the foundation for Capy's I/O model. Continue to xref:../7.examples/7a.hello-task.adoc[Example Programs] to see complete working examples.
+You have now completed the Stream Concepts section. These abstractions—streams, sources, sinks, and their type-erased wrappers—form the foundation for Capy's I/O model. Continue to xref:../8.examples/8a.hello-task.adoc[Example Programs] to see complete working examples.
diff --git a/doc/modules/ROOT/pages/7.examples/7i.echo-server-corosio.adoc b/doc/modules/ROOT/pages/7.examples/7i.echo-server-corosio.adoc
deleted file mode 100644
index 873afdad8..000000000
--- a/doc/modules/ROOT/pages/7.examples/7i.echo-server-corosio.adoc
+++ /dev/null
@@ -1,242 +0,0 @@
-= Echo Server with Corosio
-
-A complete echo server using Corosio for real network I/O.
-
-== What You Will Learn
-
-* Integrating Capy with Corosio networking
-* Accepting TCP connections
-* Handling multiple clients concurrently
-
-== Prerequisites
-
-* Completed xref:7.examples/7h.custom-dynamic-buffer.adoc[Custom Dynamic Buffer]
-* Corosio library installed
-* Understanding of TCP networking basics
-
-== Source Code
-
-[source,cpp]
-----
-#include <boost/capy.hpp>
-#include <boost/corosio.hpp>
-#include <iostream>
-
-using namespace boost::capy;
-namespace tcp = boost::corosio::tcp;
-
-// Echo handler: receives data and sends it back
-task<> echo_session(any_stream& stream, std::string client_info)
-{
-    std::cout << "[" << client_info << "] Session started\n";
-    
-    char buffer[1024];
-    std::size_t total_bytes = 0;
-    
-    try
-    {
-        for (;;)
-        {
-            // Read some data
-            auto [ec, n] = co_await stream.read_some(mutable_buffer(buffer));
-            
-            if (ec == cond::eof)
-            {
-                std::cout << "[" << client_info << "] Client disconnected\n";
-                break;
-            }
-            
-            if (ec.failed())
-            {
-                std::cout << "[" << client_info << "] Read error: " 
-                          << ec.message() << "\n";
-                break;
-            }
-            
-            total_bytes += n;
-            
-            // Echo it back
-            auto [wec, wn] = co_await write(stream, const_buffer(buffer, n));
-            
-            if (wec.failed())
-            {
-                std::cout << "[" << client_info << "] Write error: " 
-                          << wec.message() << "\n";
-                break;
-            }
-        }
-    }
-    catch (std::exception const& e)
-    {
-        std::cout << "[" << client_info << "] Exception: " << e.what() << "\n";
-    }
-    
-    std::cout << "[" << client_info << "] Session ended, "
-              << total_bytes << " bytes echoed\n";
-}
-
-// Accept loop: accepts connections and spawns handlers
-task<> accept_loop(tcp::acceptor& acceptor, executor_ref ex)
-{
-    std::cout << "Server listening on port " 
-              << acceptor.local_endpoint().port() << "\n";
-    
-    int connection_id = 0;
-    
-    for (;;)
-    {
-        // Accept a connection
-        auto [ec, socket] = co_await acceptor.async_accept();
-        
-        if (ec.failed())
-        {
-            std::cout << "Accept error: " << ec.message() << "\n";
-            continue;
-        }
-        
-        // Build client info string
-        auto remote = socket.remote_endpoint();
-        std::string client_info = 
-            std::to_string(++connection_id) + ":" +
-            remote.address().to_string() + ":" +
-            std::to_string(remote.port());
-        
-        std::cout << "[" << client_info << "] Connection accepted\n";
-        
-        // Wrap socket and spawn handler
-        // Note: socket ownership transfers to the lambda
-        run_async(ex)(
-            [](tcp::socket sock, std::string info) -> task<> {
-                any_stream stream{sock};
-                co_await echo_session(stream, std::move(info));
-            }(std::move(socket), std::move(client_info))
-        );
-    }
-}
-
-int main(int argc, char* argv[])
-{
-    try
-    {
-        // Parse port from command line
-        unsigned short port = 8080;
-        if (argc > 1)
-            port = static_cast<unsigned short>(std::stoi(argv[1]));
-        
-        // Create I/O context and thread pool
-        boost::corosio::io_context ioc;
-        thread_pool pool(4);
-        
-        // Create acceptor
-        tcp::endpoint endpoint(tcp::v4(), port);
-        tcp::acceptor acceptor(ioc, endpoint);
-        acceptor.set_option(tcp::acceptor::reuse_address(true));
-        
-        std::cout << "Starting echo server...\n";
-        
-        // Run accept loop
-        run_async(pool.get_executor())(
-            accept_loop(acceptor, pool.get_executor())
-        );
-        
-        // Run the I/O context (this blocks)
-        ioc.run();
-    }
-    catch (std::exception const& e)
-    {
-        std::cerr << "Error: " << e.what() << "\n";
-        return 1;
-    }
-    
-    return 0;
-}
-----
-
-== Build
-
-[source,cmake]
-----
-find_package(Corosio REQUIRED)
-
-add_executable(echo_server echo_server.cpp)
-target_link_libraries(echo_server PRIVATE capy Corosio::corosio)
-----
-
-== Walkthrough
-
-=== TCP Acceptor
-
-[source,cpp]
-----
-tcp::endpoint endpoint(tcp::v4(), port);
-tcp::acceptor acceptor(ioc, endpoint);
-----
-
-The acceptor listens for incoming connections on the specified port.
-
-=== Accept Loop
-
-[source,cpp]
-----
-for (;;)
-{
-    auto [ec, socket] = co_await acceptor.async_accept();
-    // ... handle connection ...
-}
-----
-
-The accept loop runs forever, accepting connections and spawning handlers. Each connection runs in its own task.
-
-=== Type Erasure
-
-[source,cpp]
-----
-any_stream stream{sock};
-co_await echo_session(stream, std::move(info));
-----
-
-The `echo_session` function accepts `any_stream&`. The concrete `tcp::socket` is wrapped at the call site. This keeps the echo logic transport-independent.
-
-=== Concurrent Clients
-
-Each client connection spawns a new task via `run_async`. Multiple clients are handled concurrently on the thread pool.
-
-== Testing
-
-Start the server:
-
-----
-$ ./echo_server 8080
-Starting echo server...
-Server listening on port 8080
-----
-
-Connect with netcat:
-
-----
-$ nc localhost 8080
-Hello
-Hello
-World
-World
-^C
-----
-
-Server output:
-
-----
-[1:127.0.0.1:54321] Connection accepted
-[1:127.0.0.1:54321] Session started
-[1:127.0.0.1:54321] Client disconnected
-[1:127.0.0.1:54321] Session ended, 12 bytes echoed
-----
-
-== Exercises
-
-1. Add a connection limit with graceful rejection
-2. Implement a simple command protocol (e.g., ECHO, QUIT, STATS)
-3. Add TLS support using Corosio's TLS streams
-
-== Next Steps
-
-* xref:7.examples/7j.stream-pipeline.adoc[Stream Pipeline] — Data transformation chains
diff --git a/doc/modules/ROOT/pages/7.testing/7.intro.adoc b/doc/modules/ROOT/pages/7.testing/7.intro.adoc
new file mode 100644
index 000000000..5be820318
--- /dev/null
+++ b/doc/modules/ROOT/pages/7.testing/7.intro.adoc
@@ -0,0 +1,66 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Testing
+
+Real I/O is a poor foundation for unit tests. Network operations are slow,
+non-deterministic, and do not fail on demand -- so error-handling paths go
+untested until production breaks them. Capy ships a self-contained toolkit
+that replaces the transport with in-memory mocks, drives coroutines to
+completion on the calling thread, and injects failures at every
+`maybe_fail()` site so that every error branch is exercised automatically.
+Because each mock satisfies the same concept as its production counterpart,
+test code reads the same as production code -- the only difference is the
+type of the stream or source you pass in.
+
+== What This Section Covers
+
+* xref:7.testing/7a.drivers.adoc[Driving Tests] -- `run_blocking` drives a
+  coroutine to completion on the calling thread without a real executor;
+  `fuse` runs the test body repeatedly, injecting an error at each
+  `maybe_fail()` site in turn until every failure path has been covered;
+  and the `thread_name` header's `set_current_thread_name` function labels
+  worker threads so that failures in multi-threaded tests are easier to
+  attribute.
+
+* xref:7.testing/7b.mock-streams.adoc[Mock Streams] -- `read_stream`,
+  `write_stream`, and `stream` (a connected pair) implement the partial-I/O
+  concepts from xref:6.streams/6b.streams.adoc[Streams]. Use them to test
+  protocol logic that calls `read_some` and `write_some` without touching a
+  socket.
+
+* xref:7.testing/7c.mock-sources-sinks.adoc[Mock Sources and Sinks] --
+  `read_source` and `write_sink` implement the complete-I/O concepts from
+  xref:6.streams/6c.sources-sinks.adoc[Sources and Sinks]. Unlike the
+  stream mocks, they loop internally until the buffer is fully filled or
+  drained, and `write_sink` accepts an explicit EOF signal.
+
+* xref:7.testing/7d.mock-buffer-concepts.adoc[Mock Buffer Sources and Sinks]
+  -- `buffer_source` and `buffer_sink` implement the buffer concepts from
+  xref:6.streams/6d.buffer-concepts.adoc[Buffer Sources and Sinks].
+  `buffer_source` exposes staged bytes via a pull interface;
+  `buffer_sink` provides callee-owned storage that the algorithm writes
+  into directly.
+
+* xref:7.testing/7e.buffer-inspection.adoc[Buffer Inspection] -- `bufgrind`
+  iterates every split point of a buffer sequence, exercising every
+  chunk-boundary condition; `buffer_to_string` concatenates buffer sequences
+  into a `std::string` for easy assertion.
+
+== How the Pieces Fit
+
+A typical test constructs one or more mocks, arms a `fuse`, and hands the
+mocks to the code under test inside a `run_blocking` call. The `fuse`
+repeats the test body automatically -- once for each failure site and once
+in exception mode -- while `run_blocking` keeps the whole thing on the
+calling thread. Buffer utilities such as `bufgrind` and `buffer_to_string`
+wrap the mock data for assertions, letting you verify that every split of
+an input buffer produces the same correct output.
+
+Continue to xref:7.testing/7a.drivers.adoc[Driving Tests] to begin.
diff --git a/doc/modules/ROOT/pages/7.testing/7a.drivers.adoc b/doc/modules/ROOT/pages/7.testing/7a.drivers.adoc
new file mode 100644
index 000000000..bd440f257
--- /dev/null
+++ b/doc/modules/ROOT/pages/7.testing/7a.drivers.adoc
@@ -0,0 +1,463 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Driving Tests
+
+Three utilities work together to run capy code synchronously inside a unit
+test: `run_blocking` drives a coroutine to completion on the calling thread,
+`fuse` injects errors at controlled points and reruns the test body until
+every failure path is covered, and `set_current_thread_name` labels worker
+threads so that multi-threaded test output is readable.
+
+== Prerequisites
+
+* xref:4.coroutines/4.intro.adoc[Coroutines in Capy]
+* xref:6.streams/6.intro.adoc[Stream Concepts]
+
+== run_blocking
+
+`run_blocking` bridges async coroutine code into a synchronous test body.
+It creates a single-threaded event loop on the calling thread, launches the
+coroutine through it, and blocks until the coroutine finishes or throws.
+No real executor or thread pool is involved.
+
+[source,cpp]
+----
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/run_blocking.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+task<int> compute(int x)
+{
+    co_return x * 2;
+}
+
+void test_compute()
+{
+    int result = 0;
+    run_blocking([&](int v) { result = v; })(compute(21));
+    BOOST_TEST(result == 42);
+}
+----
+
+=== Result Capture
+
+Call `run_blocking` with a lambda to capture the result. The lambda
+receives the coroutine's return value on success. Separate lambdas can
+handle the success and error cases independently:
+
+[source,cpp]
+----
+// Discard result; rethrow on exception
+run_blocking()(my_task());
+
+// Capture result; rethrow on exception
+int out = 0;
+run_blocking([&](int v) { out = v; })(compute(21));
+
+// Capture result; handle exception separately
+run_blocking(
+    [&](int v) { out = v; },
+    [](std::exception_ptr ep) { std::rethrow_exception(ep); }
+)(compute(21));
+
+// With a stop token (discards result)
+std::stop_source src;
+run_blocking(src.get_token())(my_task());
+
+// With a stop token and a result handler
+run_blocking(src.get_token(), [&](int v) { out = v; })(compute(21));
+
+// With a stop token and separate handlers
+run_blocking(
+    src.get_token(),
+    [&](int v) { out = v; },
+    [](std::exception_ptr ep) { std::rethrow_exception(ep); }
+)(compute(21));
+----
+
+=== How It Works
+
+`run_blocking` creates a `blocking_context`, an internal single-threaded
+execution context. Work posted to it is queued and processed on the calling
+thread until the coroutine signals completion, then control returns to the
+caller. The inline executor performs symmetric transfer for `dispatch` calls
+so that the coroutine chain runs without unnecessary context switches.
+Use this only in test code. Production code should use a real execution
+context such as a thread pool.
+
+[cols="1,2"]
+|===
+| Overload | Behavior
+
+| `run_blocking()`
+| Discard result. Rethrows captured exceptions.
+
+| `run_blocking(on_value)`
+| Invoke `on_value(v)` on success. Rethrows exceptions if `on_value`
+  does not accept `std::exception_ptr`.
+
+| `run_blocking(on_value, on_error)`
+| Invoke `on_value(v)` on success or `on_error(ep)` with
+  `std::exception_ptr` on failure.
+
+| `run_blocking(stop_token)`
+| Drive with an external stop token; discard result.
+
+| `run_blocking(stop_token, on_value)`
+| Drive with an external stop token; invoke `on_value(v)` on success.
+
+| `run_blocking(stop_token, on_value, on_error)`
+| Drive with an external stop token; invoke `on_value(v)` on success or
+  `on_error(ep)` on failure.
+|===
+
+== fuse
+
+`fuse` tests every error-handling path in a coroutine by injecting failures
+systematically. Each call to `maybe_fail()` is a potential failure point.
+The returned `result` converts to `bool` and, on failure, carries the source
+location of the failing call.
+
+[source,cpp]
+----
+#include <boost/capy/test/fuse.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_with_fuse()
+{
+    fuse f;
+    auto r = f.armed([](fuse& f) {
+        auto ec = f.maybe_fail();
+        if(ec)
+            return;  // injected error: exit gracefully
+
+        ec = f.maybe_fail();
+        if(ec)
+            return;
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== armed() vs. inert()
+
+`armed()` runs the test body in two full passes (error-code mode, then
+exception mode) and is the normal choice for exhaustive error coverage.
+
+`inert()` runs the test body exactly once with no injection. Calls to
+`maybe_fail()` always return an empty error code and never throw.
+
+Use `inert()` for happy-path verification ("does this work when nothing
+fails?"). Use `armed()` for fault-tolerance verification ("does this
+handle a failure at every async step?"). A typical test suite pairs
+both -- `inert()` confirms the function works at all, then `armed()`
+confirms it handles every error site:
+
+[source,cpp]
+----
+fuse f;
+
+// Smoke test: happy path
+auto r1 = f.inert([&](fuse&) -> task<void> {
+    read_stream rs(f);
+    rs.provide("hello");
+
+    char buf[8];
+    auto [ec, n] = co_await rs.read_some(make_buffer(buf));
+    BOOST_TEST(!ec);
+    BOOST_TEST(std::string_view(buf, n) == "hello");
+});
+BOOST_TEST(r1.success);
+
+// Fault coverage: every error site
+auto r2 = f.armed([&](fuse&) -> task<void> {
+    read_stream rs(f);
+    rs.provide("hello");
+
+    char buf[8];
+    auto [ec, n] = co_await rs.read_some(make_buffer(buf));
+    if(ec)
+        co_return;  // fuse injected an error; exit gracefully
+    BOOST_TEST(std::string_view(buf, n) == "hello");
+});
+BOOST_TEST(r2.success);
+----
+
+The only difference is the `if(ec) co_return;` guard. In `inert()`,
+that guard is dead code (`maybe_fail()` never returns an error); in
+`armed()`, it is essential.
+
+The only way to signal a test failure under `inert()` is to call
+`f.fail()` from inside the body:
+
+[source,cpp]
+----
+fuse f;
+auto r = f.inert([](fuse& f) {
+    auto ec = f.maybe_fail();  // always returns {}
+    assert(!ec);
+
+    if(some_condition_failed)
+        f.fail();  // the only way to signal failure in inert mode
+});
+BOOST_TEST(r.success);
+----
+
+=== The Early-Return Pattern
+
+When `armed()` injects an error, the test body receives it from
+`maybe_fail()`. The body must exit immediately rather than continuing
+as though the operation succeeded. The mock streams also call `maybe_fail()`
+internally, so this pattern applies to all I/O calls inside an armed test.
+
+[source,cpp]
+----
+// Correct: early return on injected error
+auto [ec, n] = co_await rs.read_some(buf);
+if(ec)
+    co_return;  // fuse injected an error -- exit gracefully
+
+// Wrong: asserting success unconditionally
+auto [ec, n] = co_await rs.read_some(buf);
+BOOST_TEST(!ec);  // fails when fuse injects an error
+----
+
+=== Coroutine Support
+
+`armed()` detects when the test lambda returns an `IoRunnable` (such as
+`task<void>`) and drives it to completion via `run_blocking` internally.
+You do not need to call `run_blocking` yourself:
+
+[source,cpp]
+----
+fuse f;
+auto r = f.armed([&](fuse&) -> task<void> {
+    auto ec = f.maybe_fail();
+    if(ec)
+        co_return;
+
+    auto [ec2, n] = co_await rs.read_some(buf);
+    if(ec2)
+        co_return;
+});
+BOOST_TEST(r.success);
+----
+
+=== Custom Fail Points
+
+A type that holds a `fuse` reference can call `maybe_fail()` from its own
+methods to declare additional fail points beyond those built into the
+mocks. Outside `armed()` or `inert()` the call is a no-op (returns an
+empty error code immediately); inside `armed()` it participates in
+fault injection alongside every other site.
+
+[source,cpp]
+----
+class widget
+{
+    fuse& f_;
+public:
+    explicit widget(fuse& f) : f_(f) {}
+
+    std::error_code process()
+    {
+        auto ec = f_.maybe_fail();
+        if(ec)
+            return ec;
+        // ... actual work ...
+        return {};
+    }
+};
+
+fuse f;
+widget w(f);
+w.process();                                    // maybe_fail() returns {}
+
+auto r = f.armed([&](fuse&) { w.process(); });  // both branches exercised
+BOOST_TEST(r.success);
+----
+
+=== Custom Error Code
+
+The default injected code is `error::test_failure`. Pass any
+`std::error_code` to the constructor to change it:
+
+[source,cpp]
+----
+fuse f(std::make_error_code(std::errc::operation_canceled));
+auto r = f.armed([](fuse& f) {
+    auto ec = f.maybe_fail();
+    if(ec)
+    {
+        assert(ec == std::errc::operation_canceled);
+        return;
+    }
+});
+BOOST_TEST(r.success);
+----
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `fuse()`
+| Construct with the default error code (`error::test_failure`).
+
+| `explicit fuse(std::error_code ec)`
+| Construct with a custom error code delivered by `maybe_fail()`.
+
+| `armed(fn) -> result`
+| Run `fn` repeatedly in error-code mode then exception mode, failing
+  at successive `maybe_fail()` sites. Accepts plain lambdas and coroutine
+  lambdas returning `IoRunnable`.
+
+| `inert(fn) -> result`
+| Run `fn` once with no injection. `maybe_fail()` always returns `{}`.
+  Accepts plain lambdas and coroutine lambdas returning `IoRunnable`.
+
+| `operator()(fn) -> result`
+| Alias for `armed(fn)`.
+
+| `maybe_fail() -> std::error_code`
+| Return the injected error code at the active failure point, or `{}`
+  otherwise. In exception mode, throws `std::system_error` instead of
+  returning an error. Outside `armed`/`inert`, always returns `{}`.
+
+| `fail()`
+| Signal an explicit test failure and stop execution. Records the call
+  site in `result::loc`.
+
+| `fail(std::exception_ptr)`
+| Signal a test failure with an associated exception. Stored in
+  `result::ep`.
+
+| `result::success`
+| `true` if the run completed without any failure.
+
+| `result::loc`
+| Source location of the last `maybe_fail()` or `fail()` call on failure.
+
+| `result::ep`
+| Exception pointer captured from a `fail(ep)` call, or `nullptr`.
+
+| `result::operator bool()`
+| Returns `result::success`.
+|===
+
+== thread_name
+
+`set_current_thread_name` names the calling thread so that debuggers,
+`htop`, and core dumps show a recognizable label instead of a generic thread
+ID. This is most useful when a test failure occurs inside a thread pool
+worker and you need to identify which worker was involved. The function is a
+no-op on platforms without thread-naming support.
+
+Platform limits on the name length:
+
+* Linux, FreeBSD, NetBSD: 15 characters
+* macOS: 63 characters
+* Windows: no practical limit
+
+[source,cpp]
+----
+#include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/thread_name.hpp>
+
+using namespace boost::capy;
+
+thread_pool pool(4);
+run_async(pool.get_executor())([]() -> task<void> {
+    set_current_thread_name("test-worker-0");
+    // ... test work runs here; name appears in gdb thread list
+    co_return;
+}());
+pool.join();
+----
+
+Note that `set_current_thread_name` lives in namespace `boost::capy`, not
+`boost::capy::test`, because the function is useful in any context, not only
+tests.
+
+[cols="1,2"]
+|===
+| Function | Description
+
+| `set_current_thread_name(char const* name)`
+| Set the OS thread name for the calling thread. Truncated to the
+  platform limit. No-op on unsupported platforms.
+|===
+
+== Putting It Together
+
+The canonical test skeleton combines a small coroutine and `fuse.armed()`.
+The coroutine overload of `armed()` drives the task itself via `run_blocking`
+internally, so the test body uses `co_await` directly:
+
+[source,cpp]
+----
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/fuse.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+task<int> add(int a, int b)
+{
+    co_return a + b;
+}
+
+void test_add()
+{
+    fuse f;
+    auto r = f.armed([&](fuse&) -> task<void> {
+        auto sum = co_await add(3, 4);
+        BOOST_TEST(sum == 7);
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Shared State Across Copies
+
+`fuse` is a value type backed by a `std::shared_ptr<state>`. Every copy
+of a `fuse` object shares the same internal state, so all copies respond
+to the same `armed()` or `inert()` call. This is what makes the canonical
+pattern work: pass a copy of `f` to each mock at construction time, then
+call `f.armed(...)` once -- the injection machinery reaches every mock
+because they all hold a copy pointing to the same shared state.
+
+For tests that need mocks, replace `add` with a function that takes a
+`read_stream`, `write_stream`, or other mock, and construct those mocks
+with the same `fuse f`. The armed loop will then exercise every I/O
+failure path through both error-code and exception modes automatically.
+
+== Reference
+
+[cols="1,3"]
+|===
+| Header | Contents
+
+| `<boost/capy/test/run_blocking.hpp>`
+| Synchronous coroutine driver.
+
+| `<boost/capy/test/fuse.hpp>`
+| Systematic error injection.
+
+| `<boost/capy/test/thread_name.hpp>`
+| Thread naming for diagnostics.
+|===
+
+Continue to xref:7.testing/7b.mock-streams.adoc[Mock Streams].
diff --git a/doc/modules/ROOT/pages/7.testing/7b.mock-streams.adoc b/doc/modules/ROOT/pages/7.testing/7b.mock-streams.adoc
new file mode 100644
index 000000000..dd3ed912c
--- /dev/null
+++ b/doc/modules/ROOT/pages/7.testing/7b.mock-streams.adoc
@@ -0,0 +1,428 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Mock Streams
+
+Concept-conforming test doubles for the partial-I/O concepts in
+xref:6.streams/6b.streams.adoc[Streams]. Use them to drive protocol
+code without real network I/O, with optional chunking to exercise
+partial-transfer paths.
+
+== read_stream
+
+`read_stream` implements the `ReadStream` concept. Test code stages bytes
+via `provide()`, then the system under test (or the test body) calls
+`read_some()` to consume them. The attached `fuse` injects errors at
+every read call, exercising the caller's error-handling paths. Because
+`fuse` copies share state (see
+xref:7.testing/7a.drivers.adoc#_shared_state_across_copies[Shared State Across Copies]),
+constructing `read_stream rs(f)` by value still ties `rs` to the same
+fail-point machinery as `f`.
+
+[source,cpp]
+----
+#include <boost/capy/test/read_stream.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_read_stream()
+{
+    fuse f;
+    read_stream rs(f);
+    rs.provide("Hello, ");
+    rs.provide("World!");
+
+    auto r = f.armed([&](fuse&) -> task<void> {
+        char buf[32];
+        auto [ec, n] = co_await rs.read_some(
+            mutable_buffer(buf, sizeof(buf)));
+        if(ec)
+            co_return;
+        BOOST_TEST(std::string_view(buf, n) == "Hello, World!");
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Chunked Delivery
+
+Passing a `max_read_size` to the constructor limits how many bytes
+`read_some` returns per call. Use this to simulate a network that
+delivers data in small pieces and verify your protocol code loops
+correctly on partial reads.
+
+[source,cpp]
+----
+// At most 4 bytes per read_some call
+fuse f;
+read_stream rs(f, 4);
+rs.provide("Hello, World!");
+
+auto r = f.armed([&](fuse&) -> task<void> {
+    char buf[32];
+    auto [ec, n] = co_await rs.read_some(
+        mutable_buffer(buf, sizeof(buf)));
+    if(ec)
+        co_return;
+    BOOST_TEST(n == 4);  // "Hell"
+});
+BOOST_TEST(r.success);
+----
+
+=== EOF Behavior
+
+When all provided data has been consumed, `read_some` returns
+`cond::eof` with a byte count of zero. The stream does not
+suspend; the result is available immediately.
+
+[source,cpp]
+----
+fuse f;
+read_stream rs(f);
+rs.provide("hi");
+
+auto r = f.inert([&](fuse&) -> task<void> {
+    char buf[8];
+    // First read: consumes "hi"
+    auto [ec, n] = co_await rs.read_some(
+        mutable_buffer(buf, sizeof(buf)));
+    BOOST_TEST(!ec);
+    BOOST_TEST(std::string_view(buf, n) == "hi");
+
+    // Second read: EOF
+    auto [ec2, n2] = co_await rs.read_some(
+        mutable_buffer(buf, sizeof(buf)));
+    BOOST_TEST(ec2 == cond::eof);
+    BOOST_TEST(n2 == 0);
+});
+BOOST_TEST(r.success);
+----
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `explicit read_stream(fuse f = {}, std::size_t max_read_size = std::size_t(-1))`
+| Construct with an optional shared `fuse` and an optional per-read byte limit.
+  When omitted, the fuse is inert and reads return all available data at once.
+  Set `max_read_size` to simulate chunked network delivery.
+
+| `provide(std::string_view sv)`
+| Append bytes to the internal buffer for subsequent reads. Multiple
+  calls accumulate data.
+
+| `read_some(MutableBufferSequence buffers)`
+| Partial read. Returns up to `max_read_size` bytes (or all available
+  if no limit was set). Returns `cond::eof` when the buffer is drained.
+  Consults the fuse before every read.
+
+| `available() -> std::size_t`
+| Return the number of bytes remaining to be read.
+
+| `clear()`
+| Clear all data and reset the read position.
+|===
+
+== write_stream
+
+`write_stream` implements the `WriteStream` concept. The system under
+test calls `write_some()` and the test inspects what was written via
+`data()`. Test code may also call `expect()` to register the data it
+anticipates; any mismatch between written bytes and that prefix causes
+`write_some()` to return `error::test_failure` directly. The fuse is a
+separate concern used only for error injection. Because `fuse` copies
+share state (see
+xref:7.testing/7a.drivers.adoc#_shared_state_across_copies[Shared State Across Copies]),
+constructing `write_stream ws(f)` by value still ties `ws` to the same
+fail-point machinery as `f`.
+
+[source,cpp]
+----
+#include <boost/capy/test/write_stream.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_write_stream()
+{
+    fuse f;
+    write_stream ws(f);
+
+    auto r = f.armed([&](fuse&) -> task<void> {
+        auto [ec, n] = co_await ws.write_some(
+            const_buffer("Hello", 5));
+        if(ec)
+            co_return;
+        BOOST_TEST(ws.data() == "Hello");
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Chunked Writes
+
+Passing a `max_write_size` to the constructor limits how many bytes
+`write_some` accepts per call, simulating a slow consumer. Use this
+to verify that your code loops correctly until all data is transferred.
+
+[source,cpp]
+----
+fuse f;
+write_stream ws(f, 4);  // accept at most 4 bytes per call
+
+auto r = f.inert([&](fuse&) -> task<void> {
+    auto [ec, n] = co_await ws.write_some(
+        const_buffer("Hello", 5));
+    BOOST_TEST(!ec);
+    BOOST_TEST(n == 4);  // only "Hell" was accepted
+});
+BOOST_TEST(r.success);
+----
+
+=== Expected Data Verification
+
+Call `expect()` before or after writes to assert that the written data
+matches a prefix. Matched bytes are consumed from both sides. If written
+data does not match the expected prefix, the next `write_some` call
+returns `error::test_failure`.
+
+[source,cpp]
+----
+fuse f;
+write_stream ws(f);
+ws.expect("Hello World");
+
+auto r = f.inert([&](fuse&) -> task<void> {
+    // Writing matching data succeeds
+    auto [ec, n] = co_await ws.write_some(
+        const_buffer("Hello World", 11));
+    BOOST_TEST(!ec);
+});
+BOOST_TEST(r.success);
+----
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `explicit write_stream(fuse f = {}, std::size_t max_write_size = std::size_t(-1))`
+| Construct with an optional shared `fuse` and an optional per-write byte limit.
+  When omitted, the fuse is inert and writes accept all bytes at once.
+  Set `max_write_size` to simulate chunked network delivery.
+
+| `write_some(ConstBufferSequence buffers)`
+| Partial write. Appends up to `max_write_size` bytes to the internal
+  buffer, then checks against the expected prefix. On mismatch, rolls
+  back the appended bytes and returns `(error::test_failure, 0)`.
+  Consults the fuse before every write.
+
+| `data() -> std::string_view`
+| Return bytes written but not yet matched by `expect()`.
+
+| `size() -> std::size_t`
+| Return the number of bytes written.
+
+| `expect(std::string_view sv) -> std::error_code`
+| Register expected data and immediately check any already-written
+  bytes. Returns an error if existing data does not match.
+|===
+
+== stream
+
+`stream` is a connected bidirectional test double. Create a pair with
+`make_stream_pair(f)`. Bytes written to one end become readable on the
+other. If `read_some` is called on an end with no buffered data, the
+calling coroutine suspends until the peer calls `write_some`. This
+makes `stream` useful for testing client/server code without real
+sockets.
+
+Both `stream` ends satisfy `ReadStream` and `WriteStream`.
+
+[source,cpp]
+----
+#include <boost/capy/test/stream.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_stream_pair()
+{
+    fuse f;
+    auto [a, b] = make_stream_pair(f);
+
+    auto r = f.armed([&](fuse&) -> task<void> {
+        auto [ec, n] = co_await a.write_some(
+            const_buffer("hello", 5));
+        if(ec)
+            co_return;
+
+        char buf[32];
+        auto [ec2, n2] = co_await b.read_some(
+            mutable_buffer(buf, sizeof(buf)));
+        if(ec2)
+            co_return;
+        BOOST_TEST(std::string_view(buf, n2) == "hello");
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Connected Semantics
+
+Data written to `a` goes into `b`'s incoming buffer, and vice versa.
+`write_some` completes immediately and posts any suspended peer reader
+before returning. If `b.read_some()` is called when `a` has not yet
+written anything, the coroutine suspends; it resumes the moment `a`
+calls `write_some`.
+
+The `provide()` member is a shortcut that injects bytes directly into
+the peer's incoming buffer, bypassing the fuse. Use it during test
+setup when you want to pre-populate data without going through an
+operation under test.
+
+=== EOF and Cross-End Closure
+
+Calling `close()` on one end signals EOF to the peer. The peer drains
+any buffered data first; once the buffer is empty, subsequent
+`read_some` calls on the peer return `cond::eof`. The peer may still
+call `write_some` after receiving EOF.
+
+When the fuse injects an error during `read_some` or `write_some`, the
+pair is automatically closed: the calling end returns the injected
+error, any suspended reader on the other end is resumed with
+`cond::eof`, and all subsequent operations on both ends return
+`cond::eof`.
+
+=== Thread Safety
+
+Single-threaded only. Both ends of the pair must be accessed from the
+same thread. Concurrent access from multiple threads or multiple
+concurrent coroutines is undefined behavior.
+
+[cols="1,2"]
+|===
+| Function / Member | Description
+
+| `make_stream_pair(fuse f = {}) -> std::pair<stream, stream>`
+| Create a connected pair sharing the supplied fuse.
+
+| `read_some(MutableBufferSequence buffers)`
+| Partial read from the peer's outgoing data. Suspends if no data is
+  available. Returns `cond::eof` when the stream is closed or the peer
+  called `close()`. Consults the fuse before every read (unless
+  draining after `close()`).
+
+| `write_some(ConstBufferSequence buffers)`
+| Partial write into the peer's incoming buffer. Resumes a suspended
+  peer reader if any. Returns `cond::eof` if the stream is closed.
+  Consults the fuse before every write.
+
+| `close()`
+| Signal EOF to the peer's reads. Buffered data is drained first.
+  Writes from the peer are unaffected.
+
+| `set_max_read_size(std::size_t n)`
+| Limit bytes returned per `read_some` call on this end, simulating
+  chunked network delivery (applies to this end only; the peer end has
+  its own independent limit).
+
+| `provide(std::string_view sv)`
+| Inject bytes into this stream for reading, bypassing the fuse.
+  Resumes a suspended `read_some` if any.
+
+| `expect(std::string_view expected) -> std::pair<std::error_code, bool>`
+| Read exactly `expected.size()` bytes and compare. Returns the error
+  code and whether the data matched.
+
+| `data() -> std::string_view`
+| Return a view of the unread bytes buffered in this stream.
+|===
+
+== Putting It Together
+
+The following snippet tests a function that reads a single line
+terminated by `'\n'` from a `ReadStream`. The `fuse.armed()` loop
+runs the coroutine repeatedly, failing at every `read_some` call in
+turn, then reruns in exception mode. Each injected failure exercises
+a different error-handling branch inside `read_line`.
+
+[source,cpp]
+----
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/concept/read_stream.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/test/read_stream.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+// Function under test: read until '\n' or EOF
+template<ReadStream S>
+task<std::pair<std::error_code, std::string>>
+read_line(S& stream)
+{
+    std::string line;
+    char ch;
+    for(;;)
+    {
+        auto [ec, n] = co_await stream.read_some(
+            mutable_buffer(&ch, 1));
+        if(ec)
+            co_return {ec, std::move(line)};
+        if(ch == '\n')
+            break;
+        line += ch;
+    }
+    co_return {std::error_code{}, std::move(line)};
+}
+
+void test_read_line()
+{
+    fuse f;
+    auto r = f.armed([&](fuse&) -> task<void> {
+        read_stream rs(f);
+        rs.provide("hello\n");
+
+        auto [ec, line] = co_await read_line(rs);
+        if(ec)
+            co_return;  // fuse injected an error; exit gracefully
+        BOOST_TEST(line == "hello");
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+== Reference
+
+[cols="1,3"]
+|===
+| Header | Contents
+
+| `<boost/capy/test/read_stream.hpp>`
+| Mock ReadStream with controllable partial reads.
+
+| `<boost/capy/test/write_stream.hpp>`
+| Mock WriteStream with controllable partial writes and expectations.
+
+| `<boost/capy/test/stream.hpp>`
+| Connected bidirectional pair for client/server tests.
+|===
+
+Continue to xref:7.testing/7c.mock-sources-sinks.adoc[Mock Sources and Sinks].
diff --git a/doc/modules/ROOT/pages/7.testing/7c.mock-sources-sinks.adoc b/doc/modules/ROOT/pages/7.testing/7c.mock-sources-sinks.adoc
new file mode 100644
index 000000000..d3308f125
--- /dev/null
+++ b/doc/modules/ROOT/pages/7.testing/7c.mock-sources-sinks.adoc
@@ -0,0 +1,291 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Mock Sources and Sinks
+
+Concept-conforming test doubles for the complete-I/O concepts in
+xref:6.streams/6c.sources-sinks.adoc[Sources and Sinks]. Sources fill the
+buffer completely (looping internally if needed); sinks accept all bytes
+and report EOF.
+
+== read_source
+
+`read_source` implements the `ReadSource` concept. Test code stages bytes
+via `provide()`, then the system under test calls `read()` and receives the
+entire requested length back (or an error or EOF). The attached `fuse`
+injects errors at every read call, exercising the caller's error-handling
+paths. Because `fuse` copies share state (see
+xref:7.testing/7a.drivers.adoc#_shared_state_across_copies[Shared State Across Copies]),
+constructing `read_source rs(f)` by value still ties `rs` to the same
+fail-point machinery as `f`.
+
+[source,cpp]
+----
+#include <boost/capy/test/read_source.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_read_source()
+{
+    fuse f;
+    read_source rs(f);
+    rs.provide("Hello, ");
+    rs.provide("World!");
+
+    auto r = f.armed([&](fuse&) -> task<void> {
+        char buf[32];
+        auto [ec, n] = co_await rs.read(
+            mutable_buffer(buf, sizeof(buf)));
+        if(ec)
+            co_return;
+        BOOST_TEST(std::string_view(buf, n) == "Hello, World!");
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Complete vs. Partial Reads
+
+`read_source` exposes both `read()` and `read_some()`. The distinction
+matters:
+
+`read_some()` is a partial read, inherited from `ReadStream`. It returns
+up to `max_read_size` bytes per call and may return fewer bytes than the
+buffer can hold. Callers must loop to fill a buffer.
+
+`read()` is a complete read, satisfying `ReadSource`. It transfers all
+available data in a single operation, ignoring the `max_read_size` limit.
+On success `n` equals `buffer_size(buffers)`. If available data runs out
+before the buffer is filled, `read()` returns `cond::eof` with `n`
+set to however many bytes were transferred. Callers do not need to loop.
+
+This is the key behavioral difference from `read_stream::read_some()`,
+which always returns a partial result and never fills the buffer on its
+own.
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `explicit read_source(fuse f = {}, std::size_t max_read_size = std::size_t(-1))`
+| Construct with an optional shared `fuse` and an optional per-read byte limit.
+  When omitted, the fuse is inert and reads return all available data at once.
+  Set `max_read_size` to simulate chunked delivery; the limit applies to
+  `read_some()` only -- `read()` ignores it.
+
+| `provide(std::string_view sv)`
+| Append bytes to the internal buffer for subsequent reads. Multiple
+  calls accumulate data.
+
+| `read(MutableBufferSequence buffers)`
+| Complete read. Transfers all available data in a single step, ignoring
+  `max_read_size`. Returns `cond::eof` with partial `n` if data runs
+  out before the buffer is filled. Consults the fuse before every call.
+
+| `read_some(MutableBufferSequence buffers)`
+| Partial read. Returns up to `max_read_size` bytes (or all available if
+  no limit was set). Returns `cond::eof` when the buffer is drained.
+  Consults the fuse before every call.
+
+| `available() -> std::size_t`
+| Return the number of bytes remaining to be read.
+
+| `clear()`
+| Clear all data and reset the read position.
+|===
+
+== write_sink
+
+`write_sink` implements the `WriteSink` concept. The system under test
+calls `write()` and `write_eof()` while the test inspects what was written
+via `data()` and checks whether EOF was signaled via `eof_called()`.
+Test code may also call `expect()` to register the data it anticipates;
+any mismatch between written bytes and that prefix causes `write_some()`
+to return `error::test_failure`. Because `fuse` copies share state (see
+xref:7.testing/7a.drivers.adoc#_shared_state_across_copies[Shared State Across Copies]),
+constructing `write_sink ws(f)` by value still ties `ws` to the same
+fail-point machinery as `f`.
+
+[source,cpp]
+----
+#include <boost/capy/test/write_sink.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_write_sink()
+{
+    fuse f;
+    write_sink ws(f);
+
+    auto r = f.armed([&](fuse&) -> task<void> {
+        auto [ec, n] = co_await ws.write(
+            const_buffer("Hello", 5));
+        if(ec)
+            co_return;
+        auto [ec2] = co_await ws.write_eof();
+        if(ec2)
+            co_return;
+    });
+    BOOST_TEST(r.success);
+    BOOST_TEST(ws.data() == "Hello");
+    BOOST_TEST(ws.eof_called());
+}
+----
+
+=== EOF Signal
+
+`write_eof()` is the explicit end-of-stream marker, with no analog in
+`write_stream`. Some protocols treat connection close as the end-of-body
+signal (HTTP/1.0 without `Content-Length` is one example), so the sink
+needs a way to capture that event separately from the data transfer.
+
+`write_sink` provides two forms of the signal:
+
+* `write_eof()` -- signal EOF without data.
+* `write_eof(buffers)` -- atomically write the last chunk and signal EOF
+  in a single awaitable. This form lets protocol code optimize the final
+  send so data and the termination marker travel together.
+
+After either form succeeds, `eof_called()` returns `true`. The fuse is
+consulted before the operation, so both forms participate in error
+injection.
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `explicit write_sink(fuse f = {}, std::size_t max_write_size = std::size_t(-1))`
+| Construct with an optional shared `fuse` and an optional per-write byte limit.
+  When omitted, the fuse is inert and writes accept all bytes at once.
+  Set `max_write_size` to simulate chunked delivery; the limit applies to
+  `write_some()` only -- `write()` and `write_eof(buffers)` ignore it.
+
+| `write(ConstBufferSequence buffers)`
+| Complete write. Transfers all bytes from `buffers` to the internal
+  buffer, ignoring `max_write_size`. Checks against expected data after
+  appending; on mismatch returns `(error::test_failure, n)` with the
+  appended bytes left in place. Consults the fuse before every call.
+
+| `write_some(ConstBufferSequence buffers)`
+| Partial write. Appends up to `max_write_size` bytes to the internal
+  buffer, then checks against the expected prefix. On mismatch, rolls
+  back the appended bytes and returns `(error::test_failure, 0)` -- in
+  contrast to `write()`, which leaves the partial write in place.
+  Consults the fuse before every call.
+
+| `write_eof(ConstBufferSequence buffers)`
+| Atomically write remaining bytes and signal end-of-stream. Sets
+  `eof_called()` to `true` on success. Consults the fuse before the call.
+
+| `write_eof()`
+| Signal end-of-stream without writing data. Sets `eof_called()` to
+  `true` on success. Consults the fuse before the call.
+
+| `data() -> std::string_view`
+| Return bytes written but not yet matched by `expect()`.
+
+| `size() -> std::size_t`
+| Return the number of bytes written.
+
+| `eof_called() -> bool`
+| Return `true` if `write_eof()` or `write_eof(buffers)` has succeeded.
+
+| `expect(std::string_view sv) -> std::error_code`
+| Register expected data and immediately check any already-written bytes.
+  Matched bytes are consumed from both sides. Returns an error if existing
+  data does not match.
+
+| `clear()`
+| Clear all data, expected data, and reset `eof_called` to `false`.
+|===
+
+== Putting It Together
+
+The following snippet tests a request-handler coroutine that reads a
+fixed-size request from a `ReadSource`, processes it, and writes the
+response to a `WriteSink`. The `fuse.armed()` loop exercises every
+error site in both the read and write paths.
+
+[source,cpp]
+----
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/concept/read_source.hpp>
+#include <boost/capy/concept/write_sink.hpp>
+#include <boost/capy/cond.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/test/read_source.hpp>
+#include <boost/capy/test/write_sink.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+// Function under test: echo the request back as the response
+template<ReadSource Source, WriteSink Sink>
+task<std::error_code>
+handle_request(Source& source, Sink& sink)
+{
+    char buf[64];
+    auto [ec, n] = co_await source.read(
+        mutable_buffer(buf, sizeof(buf)));
+    if(ec && ec != cond::eof)
+        co_return ec;
+
+    auto [ec2, n2] = co_await sink.write(
+        const_buffer(buf, n));
+    if(ec2)
+        co_return ec2;
+
+    auto [ec3] = co_await sink.write_eof();
+    if(ec3)
+        co_return ec3;
+
+    co_return std::error_code{};
+}
+
+void test_handle_request()
+{
+    fuse f;
+    auto r = f.armed([&](fuse&) -> task<void> {
+        read_source rs(f);
+        write_sink  ws(f);
+        rs.provide("ping");
+
+        auto ec = co_await handle_request(rs, ws);
+        if(ec)
+            co_return;  // fuse injected an error; exit gracefully
+        BOOST_TEST(ws.data() == "ping");
+        BOOST_TEST(ws.eof_called());
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+== Reference
+
+[cols="1,3"]
+|===
+| Header | Contents
+
+| `<boost/capy/test/read_source.hpp>`
+| Mock ReadSource with complete reads.
+
+| `<boost/capy/test/write_sink.hpp>`
+| Mock WriteSink with complete writes and explicit EOF.
+|===
+
+Continue to xref:7.testing/7d.mock-buffer-concepts.adoc[Mock Buffer Sources and Sinks].
diff --git a/doc/modules/ROOT/pages/7.testing/7d.mock-buffer-concepts.adoc b/doc/modules/ROOT/pages/7.testing/7d.mock-buffer-concepts.adoc
new file mode 100644
index 000000000..028203b73
--- /dev/null
+++ b/doc/modules/ROOT/pages/7.testing/7d.mock-buffer-concepts.adoc
@@ -0,0 +1,343 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Mock Buffer Sources and Sinks
+
+Concept-conforming test doubles for the buffer concepts in
+xref:6.streams/6d.buffer-concepts.adoc[Buffer Sources and Sinks]. These
+mocks let you test code that consumes via a `BufferSource` or produces via
+a `BufferSink` without wiring up a real dynamic buffer.
+
+== buffer_source
+
+`buffer_source` implements the `BufferSource` concept. Test code stages
+bytes via `provide()`, and the system under test pulls them through the
+`pull()`/`consume()` interface that `BufferSource` requires. Pulled buffers
+point directly into the source's internal storage, so no copy occurs. The
+attached `fuse` injects errors at every `pull()` call, exercising the
+caller's error-handling paths. Because `fuse` copies share state (see
+xref:7.testing/7a.drivers.adoc#_shared_state_across_copies[Shared State Across Copies]),
+constructing `buffer_source bs(f)` by value still ties `bs` to the same
+fail-point machinery as `f`.
+
+[source,cpp]
+----
+#include <boost/capy/test/buffer_source.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/test/buffer_to_string.hpp>
+#include <boost/capy/buffers.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_buffer_source()
+{
+    fuse f;
+    buffer_source bs(f);
+    bs.provide("Hello, ");
+    bs.provide("World!");
+
+    auto r = f.armed([&](fuse&) -> task<void> {
+        const_buffer arr[16];
+        auto [ec, bufs] = co_await bs.pull(arr);
+        if(ec)
+            co_return;
+        BOOST_TEST(buffer_to_string(bufs) == "Hello, World!");
+        bs.consume(buffer_size(bufs));
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Staging Data
+
+Call `provide()` one or more times before the system under test runs.
+Each call appends bytes to the internal buffer; the next `pull()` returns
+a span covering all accumulated unconsumed data, up to `max_pull_size` if
+a limit was set.
+
+[source,cpp]
+----
+buffer_source bs(f);
+bs.provide("part one ");
+bs.provide("part two");  // total: "part one part two"
+----
+
+=== Consume Loop
+
+`pull()` returns the same data on repeated calls until `consume()` advances
+the read position. A typical consumer loops until `pull()` returns
+`cond::eof`, consuming the returned bytes each time:
+
+[source,cpp]
+----
+const_buffer arr[16];
+for(;;)
+{
+    auto [ec, bufs] = co_await bs.pull(arr);
+    if(ec == cond::eof)
+        break;
+    if(ec)
+        co_return;  // fuse injected error, or real failure
+    // process bufs ...
+    bs.consume(buffer_size(bufs));
+}
+----
+
+=== Chunked Delivery
+
+The second constructor parameter caps the bytes returned per `pull()`,
+simulating a source that delivers data in small pieces:
+
+[source,cpp]
+----
+buffer_source bs(f, 5);   // at most 5 bytes per pull
+bs.provide("hello world");
+// first pull returns "hello"; second returns " worl"; etc.
+----
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `explicit buffer_source(fuse f = {}, std::size_t max_pull_size = std::size_t(-1))`
+| Construct with an optional shared `fuse` and an optional per-pull byte ceiling.
+  When omitted, the fuse is inert and each pull returns all remaining data.
+  Set `max_pull_size` to simulate chunked delivery.
+
+| `provide(std::string_view sv)`
+| Append bytes to the internal buffer for subsequent pulls. Multiple
+  calls accumulate data.
+
+| `pull(std::span<const_buffer> dest)`
+| Fills `dest` with buffer descriptors pointing into internal storage.
+  Await-returns `(error_code, std::span<const_buffer>)`. Returns `cond::eof`
+  when no data remains. Consults the fuse before every call. Repeated
+  calls without `consume()` return the same data.
+
+| `consume(std::size_t n)`
+| Advance the read position by `n` bytes. The next `pull()` returns data
+  starting after the consumed bytes.
+
+| `available() -> std::size_t`
+| Return the number of bytes not yet consumed.
+
+| `clear()`
+| Clear all data and reset the read position.
+|===
+
+== buffer_sink
+
+`buffer_sink` implements the `BufferSink` concept. The system under test
+follows the callee-owns-buffers pattern: it calls `prepare()` to get
+writable buffer space from the sink, writes directly into those buffers,
+then calls `commit()` or `commit_eof()` to finalize the bytes. The test
+then inspects what was captured via `data()` and checks whether the
+end-of-stream was signaled via `eof_called()`. The attached `fuse`
+injects errors at every async step. Because `fuse` copies share state (see
+xref:7.testing/7a.drivers.adoc#_shared_state_across_copies[Shared State Across Copies]),
+constructing `buffer_sink bs(f)` by value still ties `bs` to the same
+fail-point machinery as `f`.
+
+[source,cpp]
+----
+#include <boost/capy/test/buffer_sink.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/task.hpp>
+
+#include <cstring>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_buffer_sink()
+{
+    fuse f;
+    auto r = f.armed([&](fuse&) -> task<void> {
+        buffer_sink bs(f);
+
+        mutable_buffer arr[16];
+        auto bufs = bs.prepare(arr);
+
+        std::memcpy(bufs[0].data(), "Hello", 5);
+
+        auto [ec] = co_await bs.commit(5);
+        if(ec)
+            co_return;
+
+        auto [ec2] = co_await bs.commit_eof(0);
+        if(ec2)
+            co_return;
+
+        BOOST_TEST(bs.data() == "Hello");
+        BOOST_TEST(bs.eof_called());
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Reading What Was Written
+
+After the coroutine completes, `data()` returns a `string_view` of all
+committed bytes. `size()` gives the byte count. `eof_called()` returns
+`true` if `commit_eof()` succeeded during the run.
+
+[source,cpp]
+----
+BOOST_TEST(bs.data() == "expected output");
+BOOST_TEST(bs.size() == 15u);
+BOOST_TEST(bs.eof_called());
+----
+
+Call these accessors inside the `f.armed()` lambda after the system
+under test completes successfully. They are the primary mechanism for
+asserting what the system under test produced.
+
+=== The prepare/commit Protocol
+
+`prepare()` is synchronous. It fills the provided span with one writable
+buffer descriptor pointing into the sink's internal storage. The caller
+writes data into those buffers, then calls `commit(n)` to finalize `n`
+bytes, or `commit_eof(n)` to finalize `n` bytes and signal end-of-stream
+in a single step. Passing `n = 0` to `commit_eof` signals EOF without
+writing additional bytes.
+
+=== Limited Buffer Space
+
+The second constructor parameter caps the bytes available per `prepare()`,
+simulating a sink with constrained internal space:
+
+[source,cpp]
+----
+buffer_sink bs(f, 8);   // prepare returns at most 8 bytes at a time
+----
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `explicit buffer_sink(fuse f = {}, std::size_t max_prepare_size = 4096)`
+| Construct with an optional shared `fuse` and an optional per-prepare byte ceiling.
+  When omitted, the fuse is inert and `prepare()` exposes `4096` bytes of buffer
+  space. Set `max_prepare_size` to simulate limited buffer space.
+
+| `prepare(std::span<mutable_buffer> dest)`
+| Synchronously fills `dest` with writable buffer descriptors into
+  internal storage. Returns the filled span (one buffer in this
+  implementation, or empty if `dest` is empty). Does not consult
+  the fuse.
+
+| `commit(std::size_t n)`
+| Finalize `n` bytes written to the most recent `prepare()` buffers.
+  Await-returns `(error_code)`. Consults the fuse before committing.
+
+| `commit_eof(std::size_t n)`
+| Finalize `n` bytes and signal end-of-stream. Await-returns
+  `(error_code)`. Sets `eof_called()` to `true` on success. Consults
+  the fuse before committing. Pass `n = 0` to signal EOF without
+  additional data.
+
+| `data() -> std::string_view`
+| Return all bytes committed so far.
+
+| `size() -> std::size_t`
+| Return the number of bytes committed.
+
+| `eof_called() -> bool`
+| Return `true` if `commit_eof()` has succeeded.
+
+| `clear()`
+| Clear all committed data and reset `eof_called` to `false`.
+|===
+
+== Putting It Together
+
+The following snippet tests a copy algorithm that pulls from a
+`BufferSource` and writes into a `BufferSink`. The `fuse.armed()` loop
+exercises every error site in both the pull and commit paths.
+
+[source,cpp]
+----
+#include <boost/capy/buffers.hpp>
+#include <boost/capy/concept/buffer_sink.hpp>
+#include <boost/capy/concept/buffer_source.hpp>
+#include <boost/capy/cond.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/buffer_sink.hpp>
+#include <boost/capy/test/buffer_source.hpp>
+#include <boost/capy/test/fuse.hpp>
+
+#include <cstring>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+// Function under test: copy all bytes from source into sink
+template<BufferSource Source, BufferSink Sink>
+task<std::error_code>
+copy_all(Source& source, Sink& sink)
+{
+    const_buffer src_arr[16];
+    mutable_buffer dst_arr[16];
+
+    for(;;)
+    {
+        auto [ec1, src_bufs] = co_await source.pull(src_arr);
+        if(ec1 == cond::eof)
+        {
+            auto [eof_ec] = co_await sink.commit_eof(0);
+            co_return eof_ec;
+        }
+        if(ec1)
+            co_return ec1;
+
+        auto dst_bufs = sink.prepare(dst_arr);
+        std::size_t n = buffer_copy(dst_bufs, src_bufs);
+
+        auto [ec2] = co_await sink.commit(n);
+        if(ec2)
+            co_return ec2;
+
+        source.consume(n);
+    }
+}
+
+void test_copy_all()
+{
+    fuse f;
+    auto r = f.armed([&](fuse&) -> task<void> {
+        buffer_source src(f);
+        buffer_sink   dst(f);
+        src.provide("ping");
+
+        auto ec = co_await copy_all(src, dst);
+        if(ec)
+            co_return;  // fuse injected an error; exit gracefully
+        BOOST_TEST(dst.data() == "ping");
+        BOOST_TEST(dst.eof_called());
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+== Reference
+
+[cols="1,3"]
+|===
+| Header | Contents
+
+| `<boost/capy/test/buffer_source.hpp>`
+| Mock BufferSource for callee-owns-buffers pull tests.
+
+| `<boost/capy/test/buffer_sink.hpp>`
+| Mock BufferSink for callee-owns-buffers write tests.
+|===
+
+Continue to xref:7.testing/7e.buffer-inspection.adoc[Buffer Inspection].
diff --git a/doc/modules/ROOT/pages/7.testing/7e.buffer-inspection.adoc b/doc/modules/ROOT/pages/7.testing/7e.buffer-inspection.adoc
new file mode 100644
index 000000000..5b8aa3481
--- /dev/null
+++ b/doc/modules/ROOT/pages/7.testing/7e.buffer-inspection.adoc
@@ -0,0 +1,271 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Buffer Inspection
+
+Two small utilities round out the toolkit. `bufgrind` iterates every split
+point of a buffer sequence, exercising every chunk-boundary condition in the
+system under test. `buffer_to_string` concatenates buffer sequences into a
+`std::string` for assertion.
+
+== bufgrind
+
+`bufgrind` is a test utility that iterates through every way to split a buffer
+sequence into two contiguous pieces. For an N-byte input it produces N+1 split
+positions: `(0, N)`, `(1, N-1)`, ..., `(N, 0)`. The two pieces at each
+position concatenate back to the original sequence. Any code that processes
+a buffer in chunks is exercised at every possible chunk boundary with a single
+`while` loop.
+
+`bufgrind` does not perform I/O and does not consult a fuse, so the snippets
+on this page drive it under `f.inert(...)`: a single pass is sufficient to
+visit every split position, and there are no async failure sites to inject.
+
+[source,cpp]
+----
+#include <boost/capy/test/bufgrind.hpp>
+#include <boost/capy/test/buffer_to_string.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_all_splits()
+{
+    std::string data = "hello";
+    auto cb = make_buffer(data);
+
+    fuse f;
+    auto r = f.inert([&](fuse&) -> task<> {
+        bufgrind bg(cb);
+        while(bg)
+        {
+            auto [b1, b2] = co_await bg.next();
+            BOOST_TEST_EQ(buffer_to_string(b1, b2), data);
+        }
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+=== Iteration Pattern
+
+For a 5-byte input `"hello"`, `bufgrind` yields six positions:
+
+----
+pos=0: b1=""      b2="hello"
+pos=1: b1="h"     b2="ello"
+pos=2: b1="he"    b2="llo"
+pos=3: b1="hel"   b2="lo"
+pos=4: b1="hell"  b2="o"
+pos=5: b1="hello" b2=""
+----
+
+An empty buffer sequence yields one position where both pieces are empty,
+so the loop body always executes at least once.
+
+=== Step Size
+
+When the input is large, visiting every byte boundary is expensive. Pass a
+`step` parameter to skip positions. The final position (equal to the total
+size) is always visited regardless of step alignment.
+
+[source,cpp]
+----
+std::string data = "0123456789";  // 10 bytes
+auto cb = make_buffer(data);
+
+bufgrind bg(cb, 3);
+// Visits positions: 0, 3, 6, 9, 10
+while(bg)
+{
+    auto [b1, b2] = co_await bg.next();
+    // exercise parser at each split point
+}
+----
+
+A step of 0 is treated as 1. A step larger than the total size reduces to
+two positions: 0 and size.
+
+=== Mutability Preservation
+
+`bufgrind` is templated on a `ConstBufferSequence` but the split type it
+produces follows the mutability of the input. Passing a `mutable_buffer`
+yields `mutable_buffer` slices; passing a `const_buffer` yields
+`const_buffer` slices. This matters for tests that need to write into the
+produced buffers rather than only read from them.
+
+[source,cpp]
+----
+char data[] = "hello";
+mutable_buffer mb(data, 5);
+
+bufgrind bg(mb);
+while(bg)
+{
+    auto [b1, b2] = co_await bg.next();
+    // b1 and b2 are mutable_buffer; callers may write into them
+    static_assert(std::is_same_v<decltype(b1), mutable_buffer>);
+}
+----
+
+[cols="1,2"]
+|===
+| Member | Description
+
+| `bufgrind(BS const& bs, std::size_t step = 1)`
+| Construct over a buffer sequence. `step` controls how many bytes to
+  advance on each call to `next()`. A step of 0 is treated as 1.
+  The final split at `buffer_size(bs)` is always included.
+
+| `operator bool() const`
+| Return `true` while more split positions remain.
+
+| `next()`
+| Advance to the next split position. Returns an awaitable that yields
+  `split_type`, a `std::pair` of `slice_type<BS>` values representing
+  the two pieces at the current position.
+|===
+
+== buffer_to_string
+
+`buffer_to_string` concatenates one or more buffer sequences into a
+`std::string`. With a single argument it converts that buffer sequence;
+with multiple arguments it concatenates them in order. The most common
+use is asserting the combined content of a `bufgrind` split.
+
+[source,cpp]
+----
+#include <boost/capy/test/buffer_to_string.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+void test_buffer_to_string()
+{
+    // Single buffer sequence
+    const_buffer cb(make_buffer(std::string_view("hello")));
+    BOOST_TEST_EQ(buffer_to_string(cb), "hello");
+
+    // Multiple buffer sequences concatenated in order
+    const_buffer b1(make_buffer(std::string_view("hello")));
+    const_buffer b2(make_buffer(std::string_view(" world")));
+    BOOST_TEST_EQ(buffer_to_string(b1, b2), "hello world");
+}
+----
+
+=== Use With bufgrind
+
+The typical pattern passes both halves of a `bufgrind` split directly to
+`buffer_to_string` to verify that each split reconstructs the original
+input:
+
+[source,cpp]
+----
+std::string original = "hello world";
+auto cb = make_buffer(original);
+
+fuse f;
+auto r = f.inert([&](fuse&) -> task<> {
+    bufgrind bg(cb);
+    while(bg)
+    {
+        auto [b1, b2] = co_await bg.next();
+        BOOST_TEST_EQ(buffer_to_string(b1, b2), original);
+    }
+});
+BOOST_TEST(r.success);
+----
+
+[cols="1,2"]
+|===
+| Function | Description
+
+| `buffer_to_string(Buffers const&... bufs) -> std::string`
+| Concatenate one or more `ConstBufferSequence` arguments into a single
+  `std::string`. Arguments are appended in the order given.
+|===
+
+== Putting It Together
+
+The following snippet tests a hypothetical parser that reads from a
+`read_stream`. `bufgrind` exercises every split of the input so the parser
+is run against every possible chunk boundary; `buffer_to_string` verifies
+the output at each split:
+
+[source,cpp]
+----
+#include <boost/capy/test/bufgrind.hpp>
+#include <boost/capy/test/buffer_to_string.hpp>
+#include <boost/capy/test/fuse.hpp>
+#include <boost/capy/test/read_stream.hpp>
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/task.hpp>
+
+using namespace boost::capy;
+using namespace boost::capy::test;
+
+// Hypothetical parser: reads all bytes from a ReadStream
+task<std::string> read_all(read_stream& rs)
+{
+    std::string out;
+    std::array<char, 64> buf;
+    for(;;)
+    {
+        auto [ec, n] = co_await rs.read_some(make_buffer(buf));
+        if(ec)
+            co_return out;
+        out.append(buf.data(), n);
+    }
+}
+
+void test_parser_all_splits()
+{
+    std::string input = "GET / HTTP/1.1\r\n";
+    auto cb = make_buffer(input);
+
+    fuse f;
+    auto r = f.inert([&](fuse&) -> task<> {
+        bufgrind bg(cb);
+        while(bg)
+        {
+            auto [b1, b2] = co_await bg.next();
+
+            // Feed the split as two discrete reads
+            read_stream rs(f);
+            rs.provide(buffer_to_string(b1));
+            rs.provide(buffer_to_string(b2));
+
+            std::string got = co_await read_all(rs);
+            BOOST_TEST_EQ(got, input);
+        }
+    });
+    BOOST_TEST(r.success);
+}
+----
+
+== Reference
+
+[cols="1,3"]
+|===
+| Header | Contents
+
+| `<boost/capy/test/bufgrind.hpp>`
+| Exhaustive buffer split-point iterator.
+
+| `<boost/capy/test/buffer_to_string.hpp>`
+| Buffer-sequence to string helper.
+|===
+
+You have reached the end of the Testing section. Continue
+to xref:8.examples/8.intro.adoc[Example Programs] for end-to-end usage
+or xref:reference:boost/capy.adoc[Reference] for the API browser.
diff --git a/doc/modules/ROOT/pages/8.design/8c.ReadStream.adoc b/doc/modules/ROOT/pages/8.design/8c.ReadStream.adoc
deleted file mode 100644
index f37c1bb78..000000000
--- a/doc/modules/ROOT/pages/8.design/8c.ReadStream.adoc
+++ /dev/null
@@ -1,370 +0,0 @@
-= ReadStream Concept Design
-
-== Overview
-
-This document describes the design of the `ReadStream` concept: the fundamental partial-read primitive in the concept hierarchy. It explains why `read_some` is the correct building block, how composed algorithms build on top of it, and the relationship to `ReadSource`.
-
-== Definition
-
-[source,cpp]
-----
-template<typename T>
-concept ReadStream =
-    requires(T& stream, mutable_buffer_archetype buffers)
-    {
-        { stream.read_some(buffers) } -> IoAwaitable;
-        requires awaitable_decomposes_to<
-            decltype(stream.read_some(buffers)),
-            std::error_code, std::size_t>;
-    };
-----
-
-A `ReadStream` provides a single operation:
-
-=== `read_some(buffers)` -- Partial Read
-
-Reads one or more bytes from the stream into the buffer sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes read.
-
-==== Semantics
-
-- On success: `!ec`, `n >= 1` and `n \<= buffer_size(buffers)`.
-- On EOF: `ec == cond::eof`, `n == 0`.
-- On error: `ec`, `n == 0`.
-- If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`.
-
-The caller must not assume the buffer is filled. `read_some` may return fewer bytes than the buffer can hold. This is the defining property of a partial-read primitive.
-
-Once `read_some` returns an error (including EOF), the caller must not call `read_some` again. The stream is done. Not all implementations can reproduce a prior error on subsequent calls, so the behavior after an error is undefined.
-
-Buffers in the sequence are filled completely before proceeding to the next buffer in the sequence.
-
-==== Buffer Lifetime
-
-The caller must ensure that the memory referenced by `buffers` remains valid until the `co_await` expression returns.
-
-==== Conforming Signatures
-
-[source,cpp]
-----
-template<MutableBufferSequence Buffers>
-IoAwaitable auto read_some(Buffers buffers);
-----
-
-Buffer sequences should be accepted by value when the member function is a coroutine, to ensure the sequence lives in the coroutine frame across suspension points.
-
-== Concept Hierarchy
-
-`ReadStream` is the base of the read-side hierarchy:
-
-----
-ReadStream    { read_some }
-    |
-    v
-ReadSource    { read_some, read }
-----
-
-`ReadSource` refines `ReadStream`. Every `ReadSource` is a `ReadStream`. Algorithms constrained on `ReadStream` accept both raw streams and sources. The `ReadSource` concept adds a complete-read primitive on top of the partial-read primitive.
-
-This mirrors the write side:
-
-----
-WriteStream   { write_some }
-    |
-    v
-WriteSink     { write_some, write, write_eof(buffers), write_eof() }
-----
-
-== Composed Algorithms
-
-Three composed algorithms build on `read_some`:
-
-=== `read(stream, buffers)` -- Fill a Buffer Sequence
-
-[source,cpp]
-----
-auto read(ReadStream auto& stream,
-          MutableBufferSequence auto const& buffers)
-    -> io_task<std::size_t>;
-----
-
-Loops `read_some` until the entire buffer sequence is filled or an error (including EOF) occurs. On success, `n == buffer_size(buffers)`.
-
-[source,cpp]
-----
-template<ReadStream Stream>
-task<> read_header(Stream& stream)
-{
-    char header[16];
-    auto [ec, n] = co_await read(
-        stream, mutable_buffer(header));
-    if(ec == cond::eof)
-        co_return;  // clean shutdown
-    if(ec)
-        co_return;
-    // header contains exactly 16 bytes
-}
-----
-
-=== `read(stream, dynamic_buffer)` -- Read Until EOF
-
-[source,cpp]
-----
-auto read(ReadStream auto& stream,
-          DynamicBufferParam auto&& buffers,
-          std::size_t initial_amount = 2048)
-    -> io_task<std::size_t>;
-----
-
-Reads from the stream into a dynamic buffer until EOF is reached. The buffer grows with a 1.5x factor when filled. On success (EOF), `ec` is clear and `n` is the total bytes read.
-
-[source,cpp]
-----
-template<ReadStream Stream>
-task<std::string> slurp(Stream& stream)
-{
-    std::string body;
-    auto [ec, n] = co_await read(
-        stream, string_dynamic_buffer(&body));
-    if(ec)
-        co_return {};
-    co_return body;
-}
-----
-
-=== `read_until(stream, dynamic_buffer, match)` -- Delimited Read
-
-Reads from the stream into a dynamic buffer until a delimiter or match condition is found. Used for line-oriented protocols and message framing.
-
-[source,cpp]
-----
-template<ReadStream Stream>
-task<> read_line(Stream& stream)
-{
-    std::string line;
-    auto [ec, n] = co_await read_until(
-        stream, string_dynamic_buffer(&line), "\r\n");
-    if(ec)
-        co_return;
-    // line contains data up to and including "\r\n"
-}
-----
-
-== Use Cases
-
-=== Incremental Processing with `read_some`
-
-When processing data as it arrives without waiting for a full buffer, `read_some` is the right choice. This is common for real-time data or when the processing can handle partial input.
-
-[source,cpp]
-----
-template<ReadStream Stream>
-task<> echo(Stream& stream, WriteStream auto& dest)
-{
-    char buf[4096];
-    for(;;)
-    {
-        auto [ec, n] = co_await stream.read_some(
-            mutable_buffer(buf));
-        if(ec == cond::eof)
-            co_return;
-        if(ec)
-            co_return;
-
-        // Forward whatever we received immediately
-        auto [wec, nw] = co_await dest.write_some(
-            const_buffer(buf, n));
-        if(wec)
-            co_return;
-    }
-}
-----
-
-=== Relaying from ReadStream to WriteStream
-
-When relaying data from a reader to a writer, `read_some` feeds `write_some` directly. This is the fundamental streaming pattern.
-
-[source,cpp]
-----
-template<ReadStream Src, WriteStream Dest>
-task<> relay(Src& src, Dest& dest)
-{
-    char storage[65536];
-    circular_dynamic_buffer cb(storage, sizeof(storage));
-
-    for(;;)
-    {
-        // Read into free space
-        auto mb = cb.prepare(cb.capacity());
-        auto [rec, nr] = co_await src.read_some(mb);
-        cb.commit(nr);
-
-        if(rec && rec != cond::eof)
-            co_return;
-
-        // Drain to destination
-        while(cb.size() > 0)
-        {
-            auto [wec, nw] = co_await dest.write_some(
-                cb.data());
-            if(wec)
-                co_return;
-            cb.consume(nw);
-        }
-
-        if(rec == cond::eof)
-            co_return;
-    }
-}
-----
-
-Because `ReadSource` refines `ReadStream`, this relay function also accepts `ReadSource` types. An HTTP body source or a decompressor can be relayed to a `WriteStream` using the same function.
-
-== Relationship to the Write Side
-
-[cols="1,1"]
-|===
-| Read Side | Write Side
-
-| `ReadStream::read_some`
-| `WriteStream::write_some`
-
-| `read` free function (composed)
-| `write_now` (composed, eager)
-
-| `read_until` (composed, delimited)
-| No write-side equivalent
-
-| `ReadSource::read`
-| `WriteSink::write`
-|===
-
-== Design Foundations: Why Errors Exclude Data
-
-The `read_some` contract requires that `n` is 0 whenever `ec` is set. Data and errors are mutually exclusive outcomes. This is the most consequential design decision in the `ReadStream` concept, with implications for every consumer of `read_some` in the library. The rule follows Asio's established `AsyncReadStream` contract, is reinforced by the behavior of POSIX and Windows I/O system calls, and produces cleaner consumer code. This section explains the design and its consequences.
-
-=== Reconstructing Kohlhoff's Reasoning
-
-Christopher Kohlhoff's Asio library defines an `AsyncReadStream` concept with the identical requirement: on error, `bytes_transferred` is 0. No design rationale document accompanies this rule. The reasoning presented here was reconstructed from three sources:
-
-- *The Asio source code.* The function `non_blocking_recv1` in `socket_ops.ipp` explicitly sets `bytes_transferred = 0` on every error path. The function `complete_iocp_recv` maps Windows IOCP errors to portable error codes, relying on the operating system's guarantee that failed completions report zero bytes. These are deliberate choices, not accidental pass-through of OS behavior.
-- *A documentation note Kohlhoff left.* Titled "Why EOF is an error," it gives two reasons: composed operations need EOF-as-error to report contract violations, and EOF-as-error disambiguates the end of a stream from a successful zero-byte read. The note is terse but the implications are deep.
-- *Analysis of the underlying system calls.* POSIX `recv()` and Windows `WSARecv()` both enforce a binary outcome per call: data or error, never both. This is not because the {cpp} abstraction copied the OS, but because both levels face the same fundamental constraint.
-
-The following sections examine each of these points and their consequences.
-
-=== Alignment with Asio
-
-Asio's `AsyncReadStream` concept has enforced the same rule for over two decades: on error, `bytes_transferred` is 0. This is a deliberate design choice, not an accident. The Asio source code explicitly zeroes `bytes_transferred` on every error path, and the underlying system calls (POSIX `recv()`, Windows IOCP) enforce binary outcomes at the OS level. The `read_some` contract follows this established practice.
-
-=== The Empty-Buffer Rule
-
-Every `ReadStream` must support the following:
-
-[quote]
-`read_some(empty_buffer)` completes immediately with `{success, 0}`.
-
-This is a no-op. The caller passed no buffer space, so no I/O is attempted. The operation does not inspect the stream's internal state because that would require a probe capability -- a way to ask "is there data? is the stream at EOF?" -- without actually reading. Not every source supports probing. A TCP socket does not know that its peer has closed until it calls `recv()` and gets 0 back. A pipe does not know it is broken until a read fails. The empty-buffer rule is therefore unconditional: return `{success, 0}` regardless of the stream's state.
-
-This rule is a natural consequence of the contract, not a proof of it. When no I/O is attempted, no state is discovered and no error is reported.
-
-=== Why EOF Is an Error
-
-Kohlhoff's documentation note gives two reasons for making EOF an error code rather than a success:
-
-*Composed operations need EOF-as-error to report contract violations.* The composed `read(stream, buffer(buf, 100))` promises to fill exactly 100 bytes. If the stream ends after 50, the operation did not fulfill its contract. Reporting `{success, 50}` would be misleading -- it suggests the operation completed normally. Reporting `{eof, 50}` tells the caller both what happened (50 bytes landed in the buffer) and why the operation stopped (the stream ended). EOF-as-error is the mechanism by which composed operations explain early termination.
-
-*EOF-as-error disambiguates the empty-buffer no-op from the end of a stream.* Without EOF-as-error, both `read_some(empty_buffer)` on a live stream and `read_some(non_empty_buffer)` on an exhausted stream would produce `{success, 0}`. The caller could not distinguish "I passed no buffer" from "the stream is done." Making EOF an error code separates these two cases cleanly.
-
-These two reasons reinforce each other. Composed operations need EOF to be an error code so they can report early termination. The empty-buffer rule needs EOF to be an error code so `{success, 0}` is unambiguously a no-op. Together with the rule that errors exclude data, `read_some` results form a clean trichotomy: success with data, or an error (including EOF) without data.
-
-=== The Write-Side Asymmetry
-
-On the write side, `WriteSink` provides `write_eof(buffers)` to atomically combine the final data with the EOF signal. A natural question follows: if the write side fuses data with EOF, why does the read side forbid it?
-
-The answer is that the two sides of the I/O boundary have different roles. The writer _decides_ when to signal EOF. The reader _discovers_ it. This asymmetry has three consequences:
-
-*`write_eof` exists for correctness, not convenience.* Protocol framings require the final data and the EOF marker to be emitted together so the peer observes a complete message. HTTP chunked encoding needs the terminal `0\r\n\r\n` coalesced with the final data chunk. A TLS session needs the close-notify alert coalesced with the final application data. A compressor needs `Z_FINISH` applied to the final input. These are correctness requirements, not optimizations. On the read side, whether the last bytes arrive with EOF or on a separate call does not change what the reader observes. The data and the order are identical either way.
-
-*`write_eof` is a separate function the caller explicitly invokes.* `write_some` never signals EOF. The writer opts into data-plus-EOF by calling a different function. The call site reads `write_eof(data)` and the intent is unambiguous. If `read_some` could return data with EOF, every call to `read_some` would _sometimes_ be a data-only operation and _sometimes_ a data-plus-EOF operation. The stream decides which mode the caller gets, at runtime. Every call site must handle both possibilities. The burden falls on every consumer in the codebase, not on a single call site that opted into the combined behavior.
-
-*A hypothetical `read_eof` makes no sense.* On the write side, `write_eof` exists because the producer signals the end of data. On the read side, the consumer does not tell the stream to end -- it discovers that the stream has ended. EOF flows from producer to consumer, not the reverse. There is no action the reader can take to "read the EOF." The reader discovers EOF as a side effect of attempting to read.
-
-=== A Clean Trichotomy
-
-With the current contract, every `read_some` result falls into exactly one of three mutually exclusive cases:
-
-- **Success**: `!ec`, `n >= 1` -- data arrived, process it.
-- **EOF**: `ec == cond::eof`, `n == 0` -- stream ended, no data.
-- **Error**: `ec`, `n == 0` -- failure, no data.
-
-Data is present if and only if the operation succeeded. This invariant -- _data implies success_ -- eliminates an entire category of reasoning from every read loop. The common pattern is:
-
-[source,cpp]
-----
-auto [ec, n] = co_await stream.read_some(buf);
-if(ec)
-    break;        // EOF or error -- no data to handle
-process(buf, n);  // only reached on success, n >= 1
-----
-
-If `read_some` could return `n > 0` with EOF, the loop becomes:
-
-[source,cpp]
-----
-auto [ec, n] = co_await stream.read_some(buf);
-if(n > 0)
-    process(buf, n);  // must handle data even on EOF
-if(ec)
-    break;
-----
-
-Every consumer pays this tax: an extra branch to handle data accompanying EOF. The branch is easy to forget. Forgetting it silently drops the final bytes of the stream -- a bug that only manifests when the source delivers EOF with its last data rather than on a separate call. A TCP socket receiving data in one packet and FIN in another will not trigger the bug. A memory source that knows its remaining length will. The non-determinism makes the bug difficult to reproduce and diagnose.
-
-The clean trichotomy eliminates this class of bugs entirely.
-
-=== Conforming Sources
-
-Every concrete `ReadStream` implementation naturally separates its last data delivery from its EOF signal:
-
-- **TCP sockets**: `read_some` maps to a single `recv()` or `WSARecv()` call, returning whatever the kernel has buffered. The kernel delivers bytes on one call and returns 0 on the next. The separation is inherent in the POSIX and Windows APIs.
-- **TLS streams**: `read_some` decrypts and returns one TLS record's worth of application data. The close-notify alert arrives as a separate record.
-- **HTTP content-length body**: the source delivers bytes up to the content-length limit. Once the limit is reached, the next `read_some` returns EOF.
-- **HTTP chunked body**: the unchunker delivers decoded data from chunks. The terminal `0\r\n\r\n` is parsed on a separate pass that returns EOF.
-- **Compression (inflate)**: the decompressor delivers output bytes. When `Z_STREAM_END` is detected, the next read returns EOF.
-- **Memory source**: returns `min(requested, remaining)` bytes. When `remaining` reaches 0, the next call returns EOF.
-- **QUIC streams**: `read_some` returns data from received QUIC frames. Stream FIN is delivered as EOF on a subsequent call.
-- **Buffered read streams**: `read_some` returns data from an internal buffer, refilling from the underlying stream when empty. EOF propagates from the underlying stream.
-- **Test mock streams**: `read_some` returns configurable data and error sequences for testing.
-
-No source is forced into an unnatural pattern. The `read_some` call that discovers EOF is the natural result of attempting to read from an exhausted stream -- not a separate probing step. Once the caller receives EOF, it stops reading.
-
-=== Composed Operations and Partial Results
-
-The composed `read` algorithm (and `ReadSource::read`) _does_ report `n > 0` on EOF, because it accumulates data across multiple internal `read_some` calls. When the underlying stream signals EOF mid-accumulation, discarding the bytes already gathered would be wrong. The caller needs `n` to know how much valid data landed in the buffer.
-
-The design separates concerns cleanly: the single-shot primitive (`read_some`) delivers unambiguous results with a clean trichotomy. Composed operations that accumulate state (`read`) report what they accumulated, including partial results on EOF. Callers who need partial-on-EOF semantics get them through the composed layer, while the primitive layer remains clean.
-
-=== Evidence from the Asio Implementation
-
-The Asio source code confirms this design at every level.
-
-On POSIX platforms, `non_blocking_recv1` in `socket_ops.ipp` calls `recv()` and branches on the result. If `recv()` returns a positive value, the bytes are reported as a successful transfer. If `recv()` returns 0 on a stream socket, EOF is reported. If `recv()` returns -1, the function explicitly sets `bytes_transferred = 0` before returning the error. The POSIX `recv()` system call itself enforces binary outcomes: it returns `N > 0` on success, `0` on EOF, or `-1` on error. A single call never returns both data and an error.
-
-On Windows, `complete_iocp_recv` processes the results from `GetQueuedCompletionStatus`. It maps `ERROR_NETNAME_DELETED` to `connection_reset` and `ERROR_PORT_UNREACHABLE` to `connection_refused`. Windows IOCP similarly reports zero `bytes_transferred` on failed completions. The operating system enforces the same binary outcome per I/O completion.
-
-The one edge case is POSIX signal interruption (`EINTR`). If a signal arrives after `recv()` has already copied some bytes, the kernel returns the partial byte count as success rather than `-1`/`EINTR`. Asio handles this transparently by retrying on `EINTR`, so the caller never observes it. Even the kernel does not combine data with an error -- it chooses to report the partial data as success.
-
-=== Convergent Design with POSIX
-
-POSIX `recv()` independently enforces the same rule: `N > 0` on success, `-1` on error, `0` on EOF. The kernel never returns "here are your last 5 bytes, and also EOF." It delivers the available bytes on one call and returns 0 on the next. This is not because the {cpp} abstraction copied POSIX semantics. It is because the kernel faces the same fundamental constraint: state is discovered through the act of I/O. The alignment between `read_some` and `recv()` is convergent design, not leaky abstraction.
-
-== Summary
-
-`ReadStream` provides `read_some` as the single partial-read primitive. This is deliberately minimal:
-
-- Algorithms that need to fill a buffer completely use the `read` composed algorithm.
-- Algorithms that need delimited reads use `read_until`.
-- Algorithms that need to process data as it arrives use `read_some` directly.
-- `ReadSource` refines `ReadStream` by adding `read` for complete-read semantics.
-
-The contract that errors exclude data follows Asio's established `AsyncReadStream` contract, aligns with POSIX and Windows system call semantics, and produces a clean trichotomy that makes every read loop safe by construction.
diff --git a/doc/modules/ROOT/pages/7.examples/7.intro.adoc b/doc/modules/ROOT/pages/8.examples/8.intro.adoc
similarity index 100%
rename from doc/modules/ROOT/pages/7.examples/7.intro.adoc
rename to doc/modules/ROOT/pages/8.examples/8.intro.adoc
diff --git a/doc/modules/ROOT/pages/7.examples/7a.hello-task.adoc b/doc/modules/ROOT/pages/8.examples/8a.hello-task.adoc
similarity index 86%
rename from doc/modules/ROOT/pages/7.examples/7a.hello-task.adoc
rename to doc/modules/ROOT/pages/8.examples/8a.hello-task.adoc
index 22eaa1703..4be42e699 100644
--- a/doc/modules/ROOT/pages/7.examples/7a.hello-task.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8a.hello-task.adoc
@@ -20,9 +20,9 @@ The minimal Capy program: a task that prints a message.
 #include <boost/capy.hpp>
 #include <iostream>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
-task<> say_hello()
+capy::task<> say_hello()
 {
     std::cout << "Hello from Capy!\n";
     co_return;
@@ -30,8 +30,8 @@ task<> say_hello()
 
 int main()
 {
-    thread_pool pool;
-    run_async(pool.get_executor())(say_hello());
+    capy::thread_pool pool;
+    capy::run_async(pool.get_executor())(say_hello());
     return 0;
 }
 ----
@@ -50,7 +50,7 @@ target_link_libraries(hello_task PRIVATE capy)
 
 [source,cpp]
 ----
-task<> say_hello()
+capy::task<> say_hello()
 {
     std::cout << "Hello from Capy!\n";
     co_return;
@@ -65,7 +65,7 @@ Tasks are lazy: calling `say_hello()` creates a task object but does not execute
 
 [source,cpp]
 ----
-thread_pool pool;
+capy::thread_pool pool;
 ----
 
 `thread_pool` provides an execution context with worker threads. By default, it creates one thread per CPU core.
@@ -76,7 +76,7 @@ The pool's destructor waits for all work to complete before returning. This ensu
 
 [source,cpp]
 ----
-run_async(pool.get_executor())(say_hello());
+capy::run_async(pool.get_executor())(say_hello());
 ----
 
 `run_async` bridges non-coroutine code (like `main`) to coroutine code. The two-call syntax:
@@ -100,4 +100,4 @@ Hello from Capy!
 
 == Next Steps
 
-* xref:7.examples/7b.producer-consumer.adoc[Producer-Consumer] — Multiple tasks communicating
+* xref:8.examples/8b.producer-consumer.adoc[Producer-Consumer] — Multiple tasks communicating
diff --git a/doc/modules/ROOT/pages/7.examples/7b.producer-consumer.adoc b/doc/modules/ROOT/pages/8.examples/8b.producer-consumer.adoc
similarity index 76%
rename from doc/modules/ROOT/pages/7.examples/7b.producer-consumer.adoc
rename to doc/modules/ROOT/pages/8.examples/8b.producer-consumer.adoc
index 697b0f9af..935b264b8 100644
--- a/doc/modules/ROOT/pages/7.examples/7b.producer-consumer.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8b.producer-consumer.adoc
@@ -11,7 +11,7 @@ Two tasks communicating via an async event, with strand serialization.
 
 == Prerequisites
 
-* Completed xref:7.examples/7a.hello-task.adoc[Hello Task]
+* Completed xref:8.examples/8a.hello-task.adoc[Hello Task]
 * Understanding of basic task creation and launching
 
 == Source Code
@@ -23,43 +23,44 @@ Two tasks communicating via an async event, with strand serialization.
 #include <iostream>
 #include <latch>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 int main()
 {
-    thread_pool pool;  // thread_pool
-    strand s{pool.get_executor()};  // strand - serializes execution
-    std::latch done(1);  // std::latch - wait for completion
+    capy::thread_pool pool;
+    capy::strand s{pool.get_executor()};
+    std::latch done(1);
 
-    auto on_complete = [&done](auto&&...) { done.count_down(); };  // lambda
-    auto on_error = [&done](std::exception_ptr) { done.count_down(); };  // lambda
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr) { done.count_down(); };
 
-    async_event data_ready;  // async_event
-    int shared_value = 0;    // int
+    capy::async_event data_ready;
+    int shared_value = 0;
 
-    auto producer = [&]() -> task<> {
+    auto producer = [&]() -> capy::io_task<> {
         std::cout << "Producer: preparing data...\n";
         shared_value = 42;
         std::cout << "Producer: data ready, signaling\n";
         data_ready.set();
-        co_return;
+        co_return capy::io_result<>{};
     };
 
-    auto consumer = [&]() -> task<> {
+    auto consumer = [&]() -> capy::io_task<> {
         std::cout << "Consumer: waiting for data...\n";
-        co_await data_ready.wait();
+        auto [ec] = co_await data_ready.wait();
+        (void)ec;
         std::cout << "Consumer: received value " << shared_value << "\n";
-        co_return;
+        co_return capy::io_result<>{};
     };
 
     // Run both tasks concurrently using when_all, through a strand.
     // The strand serializes execution, ensuring thread-safe access
     // to the shared async_event and shared_value.
-    auto run_both = [&]() -> task<> {
-        co_await when_all(producer(), consumer());
+    auto run_both = [&]() -> capy::task<> {
+        (void) co_await capy::when_all(producer(), consumer());
     };
 
-    run_async(s, on_complete, on_error)(run_both());
+    capy::run_async(s, on_complete, on_error)(run_both());
 
     done.wait();  // Block until tasks complete
     return 0;
@@ -80,7 +81,7 @@ target_link_libraries(producer_consumer PRIVATE capy)
 
 [source,cpp]
 ----
-strand s{pool.get_executor()};  // strand - serializes execution
+capy::strand s{pool.get_executor()};
 ----
 
 A `strand` is an executor adaptor that serializes execution. All coroutines dispatched through a strand are guaranteed not to run concurrently, making it safe to access shared state without explicit locking. Note that `async_event` is not thread-safe, so using a strand ensures safe access.
@@ -89,7 +90,7 @@ A `strand` is an executor adaptor that serializes execution. All coroutines disp
 
 [source,cpp]
 ----
-async_event data_ready;  // async_event
+capy::async_event data_ready;
 ----
 
 `async_event` is a one-shot signaling mechanism. One task can `set()` it; other tasks can `wait()` for it. When set, all waiting tasks resume.
@@ -98,12 +99,12 @@ async_event data_ready;  // async_event
 
 [source,cpp]
 ----
-auto producer = [&]() -> task<> {
+auto producer = [&]() -> capy::io_task<> {
     std::cout << "Producer: preparing data...\n";
     shared_value = 42;
     std::cout << "Producer: data ready, signaling\n";
     data_ready.set();
-    co_return;
+    co_return capy::io_result<>{};
 };
 ----
 
@@ -113,11 +114,12 @@ The producer prepares data and signals completion by calling `set()`.
 
 [source,cpp]
 ----
-auto consumer = [&]() -> task<> {
+auto consumer = [&]() -> capy::io_task<> {
     std::cout << "Consumer: waiting for data...\n";
-    co_await data_ready.wait();
+    auto [ec] = co_await data_ready.wait();
+    (void)ec;
     std::cout << "Consumer: received value " << shared_value << "\n";
-    co_return;
+    co_return capy::io_result<>{};
 };
 ----
 
@@ -130,11 +132,11 @@ The consumer waits until the event is set. The `co_await data_ready.wait()` susp
 // Run both tasks concurrently using when_all, through a strand.
 // The strand serializes execution, ensuring thread-safe access
 // to the shared async_event and shared_value.
-auto run_both = [&]() -> task<> {
-    co_await when_all(producer(), consumer());
+auto run_both = [&]() -> capy::task<> {
+    (void) co_await capy::when_all(producer(), consumer());
 };
 
-run_async(s, on_complete, on_error)(run_both());
+capy::run_async(s, on_complete, on_error)(run_both());
 ----
 
 `when_all` runs both tasks concurrently within the same parent coroutine context, but the strand ensures they don't run at the same time on different threads. The producer signals `data_ready` when the value is set, and the consumer waits for the signal before reading.
@@ -171,4 +173,4 @@ Consumer: received value 42
 
 == Next Steps
 
-* xref:7.examples/7c.buffer-composition.adoc[Buffer Composition] — Zero-allocation buffer composition
+* xref:8.examples/8c.buffer-composition.adoc[Buffer Composition] — Zero-allocation buffer composition
diff --git a/doc/modules/ROOT/pages/7.examples/7c.buffer-composition.adoc b/doc/modules/ROOT/pages/8.examples/8c.buffer-composition.adoc
similarity index 61%
rename from doc/modules/ROOT/pages/7.examples/7c.buffer-composition.adoc
rename to doc/modules/ROOT/pages/8.examples/8c.buffer-composition.adoc
index b8410c1c2..4109a978d 100644
--- a/doc/modules/ROOT/pages/7.examples/7c.buffer-composition.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8c.buffer-composition.adoc
@@ -5,12 +5,12 @@ Composing buffer sequences without allocation for scatter/gather I/O.
 == What You Will Learn
 
 * Creating buffers from different sources
-* Using `const_buffer_pair` and `mutable_buffer_pair` for scatter/gather I/O
+* Using `std::array<const_buffer, N>` and `std::array<mutable_buffer, N>` for scatter/gather I/O
 * Zero-allocation buffer sequence patterns
 
 == Prerequisites
 
-* Completed xref:7.examples/7b.producer-consumer.adoc[Producer-Consumer]
+* Completed xref:8.examples/8b.producer-consumer.adoc[Producer-Consumer]
 * Understanding of buffer types from xref:../5.buffers/5b.types.adoc[Buffer Types]
 
 == Source Code
@@ -23,7 +23,7 @@ Composing buffer sequences without allocation for scatter/gather I/O.
 #include <array>
 #include <vector>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 void demonstrate_single_buffers()
 {
@@ -35,32 +35,32 @@ void demonstrate_single_buffers()
     std::vector<char> vec = {'V', 'e', 'c', 't', 'o', 'r'};
     
     // make_buffer creates buffer views (no copies)
-    auto str_buf = make_buffer(str);  // mutable_buffer
-    auto arr_buf = make_buffer(arr, sizeof(arr) - 1);  // mutable_buffer - Exclude null terminator
-    auto vec_buf = make_buffer(vec);  // mutable_buffer
+    auto str_buf = capy::make_buffer(str);  // mutable_buffer
+    auto arr_buf = capy::make_buffer(arr, sizeof(arr) - 1);  // mutable_buffer - Exclude null terminator
+    auto vec_buf = capy::make_buffer(vec);  // mutable_buffer
     
     std::cout << "String buffer: " << str_buf.size() << " bytes\n";
     std::cout << "Array buffer:  " << arr_buf.size() << " bytes\n";
     std::cout << "Vector buffer: " << vec_buf.size() << " bytes\n";
 }
 
-void demonstrate_buffer_pair()
+void demonstrate_two_buffer_scatter()
 {
-    std::cout << "\n=== Buffer Pair (Scatter/Gather) ===\n\n";
-    
-    // const_buffer_pair is std::array<const_buffer, 2>
+    std::cout << "\n=== Two-Buffer Scatter/Gather ===\n\n";
+
+    // A 2-element buffer sequence is just a std::array
     std::string header = "Content-Type: text/plain\r\n\r\n";
     std::string body = "Hello, World!";
-    
-    const_buffer_pair message = {{
-        make_buffer(header),
-        make_buffer(body)
+
+    std::array<capy::const_buffer, 2> message = {{
+        capy::make_buffer(header),
+        capy::make_buffer(body)
     }};
     
     // Calculate total size
-    std::size_t total = buffer_size(message);
+    std::size_t total = capy::buffer_size(message);
     std::cout << "Total message size: " << total << " bytes\n";
-    std::cout << "Buffer count: " << buffer_length(message) << "\n";
+    std::cout << "Buffer count: " << capy::buffer_length(message) << "\n";
     
     // Iterate through buffers
     std::cout << "\nBuffer contents:\n";
@@ -83,17 +83,17 @@ void demonstrate_buffer_array()
     std::string empty_line = "\r\n";
     std::string body = R"({"status":"ok"})";
     
-    std::array<const_buffer, 5> http_response = {{
-        make_buffer(status),
-        make_buffer(content_type),
-        make_buffer(server),
-        make_buffer(empty_line),
-        make_buffer(body)
+    std::array<capy::const_buffer, 5> http_response = {{
+        capy::make_buffer(status),
+        capy::make_buffer(content_type),
+        capy::make_buffer(server),
+        capy::make_buffer(empty_line),
+        capy::make_buffer(body)
     }};
     
-    std::size_t total = buffer_size(http_response);
+    std::size_t total = capy::buffer_size(http_response);
     std::cout << "HTTP response size: " << total << " bytes\n";
-    std::cout << "Buffer count: " << buffer_length(http_response) << "\n";
+    std::cout << "Buffer count: " << capy::buffer_length(http_response) << "\n";
     
     // In real code with streams:
     // co_await write(stream, http_response);
@@ -108,13 +108,13 @@ void demonstrate_mutable_buffers()
     char buf1[64];
     char buf2[64];
     
-    mutable_buffer_pair recv_buffers = {{
-        mutable_buffer(buf1, sizeof(buf1)),
-        mutable_buffer(buf2, sizeof(buf2))
+    std::array<capy::mutable_buffer, 2> recv_buffers = {{
+        capy::mutable_buffer(buf1, sizeof(buf1)),
+        capy::mutable_buffer(buf2, sizeof(buf2))
     }};
     
-    std::cout << "Prepared " << buffer_length(recv_buffers) 
-              << " buffers with " << buffer_size(recv_buffers) 
+    std::cout << "Prepared " << capy::buffer_length(recv_buffers) 
+              << " buffers with " << capy::buffer_size(recv_buffers) 
               << " bytes total capacity\n";
     
     // In real code:
@@ -124,7 +124,7 @@ void demonstrate_mutable_buffers()
 int main()
 {
     demonstrate_single_buffers();
-    demonstrate_buffer_pair();
+    demonstrate_two_buffer_scatter();
     demonstrate_buffer_array();
     demonstrate_mutable_buffers();
     
@@ -146,36 +146,36 @@ target_link_libraries(buffer_composition PRIVATE capy)
 
 [source,cpp]
 ----
-auto str_buf = make_buffer(str);  // mutable_buffer
-auto arr_buf = make_buffer(arr, sizeof(arr) - 1);  // mutable_buffer
+auto str_buf = capy::make_buffer(str);  // mutable_buffer
+auto arr_buf = capy::make_buffer(arr, sizeof(arr) - 1);  // mutable_buffer
 ----
 
 `make_buffer` creates buffer views from various sources. No data is copied—the buffers reference the original storage.
 
-=== Buffer Pairs
+=== Two-Buffer Scatter/Gather
 
 [source,cpp]
 ----
-const_buffer_pair message = {{
-    make_buffer(header),
-    make_buffer(body)
+std::array<capy::const_buffer, 2> message = {{
+    capy::make_buffer(header),
+    capy::make_buffer(body)
 }};
 ----
 
-`const_buffer_pair` is `std::array<const_buffer, 2>` — a fixed-size buffer sequence for scatter/gather I/O. Similarly, `mutable_buffer_pair` holds two mutable buffers.
+Capy's buffer-sequence concepts accept any range of `const_buffer` or `mutable_buffer`, so `std::array<X, 2>` is a buffer sequence with no further wrapping required. Use `mutable_buffer` for receive paths.
 
 === Multi-Buffer Arrays
 
 [source,cpp]
 ----
-std::array<const_buffer, 5> http_response = {{
-    make_buffer(status),
-    make_buffer(content_type),
+std::array<capy::const_buffer, 5> http_response = {{
+    capy::make_buffer(status),
+    capy::make_buffer(content_type),
     // ...
 }};
 ----
 
-For more than two buffers, use `std::array` directly. Buffer sequences support `buffer_size()` and `buffer_length()` for querying total bytes and buffer count.
+For more than two buffers, the same pattern works with a larger `std::array`. Buffer sequences support `buffer_size()` and `buffer_length()` for querying total bytes and buffer count.
 
 === Scatter/Gather I/O
 
@@ -199,7 +199,7 @@ String buffer: 13 bytes
 Array buffer:  10 bytes
 Vector buffer: 6 bytes
 
-=== Buffer Pair (Scatter/Gather) ===
+=== Two-Buffer Scatter/Gather ===
 
 Total message size: 41 bytes
 Buffer count: 2
@@ -223,9 +223,8 @@ Prepared 2 buffers with 128 bytes total capacity
 == Exercises
 
 1. Create a function that takes any `ConstBufferSequence` and prints its contents
-2. Measure the difference between copying data into a single buffer vs. using `cat()`
-3. Implement a simple message framing protocol using buffer composition
+2. Implement a simple message framing protocol using buffer composition
 
 == Next Steps
 
-* xref:7.examples/7d.mock-stream-testing.adoc[Mock Stream Testing] — Unit testing with mock streams
+* xref:8.examples/8d.mock-stream-testing.adoc[Mock Stream Testing] — Unit testing with mock streams
diff --git a/doc/modules/ROOT/pages/7.examples/7d.mock-stream-testing.adoc b/doc/modules/ROOT/pages/8.examples/8d.mock-stream-testing.adoc
similarity index 69%
rename from doc/modules/ROOT/pages/7.examples/7d.mock-stream-testing.adoc
rename to doc/modules/ROOT/pages/8.examples/8d.mock-stream-testing.adoc
index 287d14f7e..4d7fc39c0 100644
--- a/doc/modules/ROOT/pages/7.examples/7d.mock-stream-testing.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8d.mock-stream-testing.adoc
@@ -10,7 +10,7 @@ Unit testing protocol code with mock streams and error injection.
 
 == Prerequisites
 
-* Completed xref:7.examples/7c.buffer-composition.adoc[Buffer Composition]
+* Completed xref:8.examples/8c.buffer-composition.adoc[Buffer Composition]
 * Understanding of streams from xref:../6.streams/6b.streams.adoc[Streams]
 
 == Source Code
@@ -26,32 +26,34 @@ Unit testing protocol code with mock streams and error injection.
 #include <cassert>
 #include <cctype>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // A simple protocol: read until newline, echo back uppercase
 // Takes any_stream& so the function is transport-independent
-task<bool> echo_line_uppercase(any_stream& stream)
+capy::task<bool> echo_line_uppercase(capy::any_stream& stream)
 {
     std::string line;
     char c;
-    
+
     // Read character by character until newline
     while (true)
     {
         // ec: std::error_code, n: std::size_t
-        auto [ec, n] = co_await stream.read_some(mutable_buffer(&c, 1));
-        
+        auto [ec, n] = co_await stream.read_some(capy::mutable_buffer(&c, 1));
+
+        if (n > 0)
+        {
+            if (c == '\n')
+                break;
+            line += static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
+        }
+
         if (ec)
         {
-            if (ec == cond::eof)
+            if (ec == capy::cond::eof)
                 break;
             co_return false;
         }
-        
-        if (c == '\n')
-            break;
-        
-        line += static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
     }
     
     line += '\n';
@@ -62,12 +64,12 @@ task<bool> echo_line_uppercase(any_stream& stream)
     {
         // wec: std::error_code, wn: std::size_t
         auto [wec, wn] = co_await stream.write_some(
-            const_buffer(line.data() + written, line.size() - written));
-        
+            capy::const_buffer(line.data() + written, line.size() - written));
+
+        written += wn;
+
         if (wec)
             co_return false;
-        
-        written += wn;
     }
     
     co_return true;
@@ -77,19 +79,16 @@ void test_happy_path()
 {
     std::cout << "Test: happy path\n";
     
-    // Use fuse in disarmed mode (no error injection) for happy path
-    test::fuse f;  // test::fuse
-    test::stream mock(f);  // test::stream
-    mock.provide("hello\n");
-    
-    // Wrap mock in any_stream using pointer construction for reference semantics
-    any_stream stream{&mock};  // any_stream
-    
+    auto [a, b] = capy::test::make_stream_pair();
+    b.provide("hello\n");
+
+    capy::any_stream stream{&a};  // any_stream
+
     bool result = false;  // bool
-    test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
+    capy::test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
     
     assert(result == true);
-    assert(mock.data() == "HELLO\n");
+    assert(b.data() == "HELLO\n");
     
     std::cout << "  PASSED\n";
 }
@@ -98,20 +97,17 @@ void test_partial_reads()
 {
     std::cout << "Test: partial reads (1 byte at a time)\n";
     
-    // Use fuse in disarmed mode (no error injection)
-    test::fuse f;  // test::fuse
-    // Mock returns at most 1 byte per read_some
-    test::stream mock(f, 1);  // test::stream, max_read_size = 1
-    mock.provide("hi\n");
-    
-    // Wrap mock in any_stream using pointer construction for reference semantics
-    any_stream stream{&mock};  // any_stream
-    
+    auto [a, b] = capy::test::make_stream_pair();
+    a.set_max_read_size(1);
+    b.provide("hi\n");
+
+    capy::any_stream stream{&a};  // any_stream
+
     bool result = false;  // bool
-    test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
+    capy::test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
     
     assert(result == true);
-    assert(mock.data() == "HI\n");
+    assert(b.data() == "HI\n");
     
     std::cout << "  PASSED\n";
 }
@@ -125,13 +121,12 @@ void test_with_error_injection()
     
     // fuse::armed runs the test repeatedly, failing at each
     // operation point until all paths are covered
-    test::fuse f;  // test::fuse
-    auto r = f.armed([&](test::fuse&) -> task<> {  // fuse::result
-        test::stream mock(f);  // test::stream
-        mock.provide("test\n");
+    capy::test::fuse f;  // test::fuse
+    auto r = f.armed([&](capy::test::fuse&) -> capy::task<> {  // fuse::result
+        auto [a, b] = capy::test::make_stream_pair(f);
+        b.provide("test\n");
         
-        // Wrap mock in any_stream using pointer construction for reference semantics
-        any_stream stream{&mock};  // any_stream
+        capy::any_stream stream{&a};  // any_stream
         
         // Run the protocol - fuse will inject errors at each step
         bool result = co_await echo_line_uppercase(stream);  // bool
@@ -140,7 +135,7 @@ void test_with_error_injection()
         if (result)
         {
             ++success_count;
-            assert(mock.data() == "TEST\n");
+            assert(b.data() == "TEST\n");
         }
         else
         {
@@ -185,8 +180,8 @@ target_link_libraries(mock_stream_testing PRIVATE capy)
 
 [source,cpp]
 ----
-test::fuse f;  // test::fuse
-test::stream mock(f);  // test::stream
+capy::test::fuse f;  // test::fuse
+capy::test::stream mock(f);  // test::stream
 mock.provide("hello\n");
 ----
 
@@ -202,7 +197,7 @@ mock.provide("hello\n");
 [source,cpp]
 ----
 // Wrap mock in any_stream using pointer construction for reference semantics
-any_stream stream{&mock};  // any_stream
+capy::any_stream stream{&mock};  // any_stream
 ----
 
 Use pointer construction (`&mock`) so the `any_stream` wrapper references the mock without taking ownership. This allows inspecting `mock.data()` after operations.
@@ -212,7 +207,7 @@ Use pointer construction (`&mock`) so the `any_stream` wrapper references the mo
 [source,cpp]
 ----
 bool result = false;  // bool
-test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
+capy::test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
 ----
 
 `run_blocking` executes a coroutine synchronously, blocking until complete. Pass a handler to capture the result.
@@ -221,9 +216,9 @@ test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
 
 [source,cpp]
 ----
-test::fuse f;  // test::fuse
-auto r = f.armed([&](test::fuse&) -> task<> {
-    test::stream mock(f);  // test::stream
+capy::test::fuse f;  // test::fuse
+auto r = f.armed([&](capy::test::fuse&) -> capy::task<> {
+    capy::test::stream mock(f);  // test::stream
     // ... run test ...
 });
 ----
@@ -258,4 +253,4 @@ All tests passed!
 
 == Next Steps
 
-* xref:7.examples/7e.type-erased-echo.adoc[Type-Erased Echo] — Compilation firewall pattern
+* xref:8.examples/8e.type-erased-echo.adoc[Type-Erased Echo] — Compilation firewall pattern
diff --git a/doc/modules/ROOT/pages/7.examples/7e.type-erased-echo.adoc b/doc/modules/ROOT/pages/8.examples/8e.type-erased-echo.adoc
similarity index 70%
rename from doc/modules/ROOT/pages/7.examples/7e.type-erased-echo.adoc
rename to doc/modules/ROOT/pages/8.examples/8e.type-erased-echo.adoc
index afdd68570..61f01f2a0 100644
--- a/doc/modules/ROOT/pages/7.examples/7e.type-erased-echo.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8e.type-erased-echo.adoc
@@ -10,7 +10,7 @@ Echo server demonstrating the compilation firewall pattern.
 
 == Prerequisites
 
-* Completed xref:7.examples/7d.mock-stream-testing.adoc[Mock Stream Testing]
+* Completed xref:8.examples/8d.mock-stream-testing.adoc[Mock Stream Testing]
 * Understanding of type erasure from xref:../6.streams/6f.isolation.adoc[Physical Isolation]
 
 == Source Code
@@ -47,30 +47,23 @@ boost::capy::task<> echo_session(boost::capy::any_stream& stream);
 
 namespace myapp {
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
-task<> echo_session(any_stream& stream)
+capy::task<> echo_session(capy::any_stream& stream)
 {
     char buffer[1024];
-    
+
     for (;;)
     {
-        // Read some data
-        // ec: std::error_code, n: std::size_t
-        auto [ec, n] = co_await stream.read_some(make_buffer(buffer));
-        
-        if (ec == cond::eof)
-            co_return;  // Client closed connection
-        
+        auto [ec, n] = co_await stream.read_some(capy::make_buffer(buffer));
+
+        auto [wec, wn] = co_await capy::write(stream, capy::const_buffer(buffer, n));
+
         if (ec)
-            throw std::system_error(ec);
-        
-        // Echo it back
-        // wec: std::error_code, wn: std::size_t
-        auto [wec, wn] = co_await write(stream, const_buffer(buffer, n));
-        
+            co_return;
+
         if (wec)
-            throw std::system_error(wec);
+            co_return;
     }
 }
 
@@ -88,22 +81,21 @@ task<> echo_session(any_stream& stream)
 #include <boost/capy/test/run_blocking.hpp>
 #include <iostream>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 void test_with_mock()
 {
-    test::fuse f;
-    test::stream mock(f);
-    mock.provide("Hello, ");
-    mock.provide("World!\n");
-    // Stream returns eof when no more data is available
+    auto [a, b] = capy::test::make_stream_pair();
+    b.provide("Hello, ");
+    b.provide("World!\n");
+    b.close();
     
-    // Using pointer construction (&mock) for reference semantics - the
-    // wrapper does not take ownership, so mock must outlive stream.
-    any_stream stream{&mock};  // any_stream
-    test::run_blocking()(myapp::echo_session(stream));
+    // Using pointer construction (&a) for reference semantics - the
+    // wrapper does not take ownership, so a must outlive stream.
+    capy::any_stream stream{&a};  // any_stream
+    capy::test::run_blocking()(myapp::echo_session(stream));
     
-    std::cout << "Echo output: " << mock.data() << "\n";
+    std::cout << "Echo output: " << b.data() << "\n";
 }
 
 // With real sockets (using Corosio), you would write:
@@ -140,7 +132,7 @@ target_link_libraries(echo_demo PRIVATE echo_lib)
 [source,cpp]
 ----
 // echo.hpp
-task<> echo_session(any_stream& stream);
+boost::capy::task<> echo_session(boost::capy::any_stream& stream);
 ----
 
 The header declares only the signature. It includes `any_stream` and `task`, but no concrete transport types.
@@ -156,7 +148,7 @@ Clients of this header:
 [source,cpp]
 ----
 // echo.cpp
-task<> echo_session(any_stream& stream)
+capy::task<> echo_session(capy::any_stream& stream)
 {
     // Full implementation here
 }
@@ -192,4 +184,4 @@ Echo output: Hello, World!
 
 == Next Steps
 
-* xref:7.examples/7f.timeout-cancellation.adoc[Timeout with Cancellation] — Stop tokens for timeout
+* xref:8.examples/8f.timeout-cancellation.adoc[Timeout with Cancellation] — Stop tokens for timeout
diff --git a/doc/modules/ROOT/pages/7.examples/7f.timeout-cancellation.adoc b/doc/modules/ROOT/pages/8.examples/8f.timeout-cancellation.adoc
similarity index 84%
rename from doc/modules/ROOT/pages/7.examples/7f.timeout-cancellation.adoc
rename to doc/modules/ROOT/pages/8.examples/8f.timeout-cancellation.adoc
index fa55a43bf..4bd6d9a3d 100644
--- a/doc/modules/ROOT/pages/7.examples/7f.timeout-cancellation.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8f.timeout-cancellation.adoc
@@ -10,7 +10,7 @@ Using stop tokens to implement operation timeouts.
 
 == Prerequisites
 
-* Completed xref:7.examples/7e.type-erased-echo.adoc[Type-Erased Echo]
+* Completed xref:8.examples/8e.type-erased-echo.adoc[Type-Erased Echo]
 * Understanding of stop tokens from xref:../4.coroutines/4e.cancellation.adoc[Cancellation]
 
 == Source Code
@@ -18,6 +18,7 @@ Using stop tokens to implement operation timeouts.
 [source,cpp]
 ----
 #include <boost/capy.hpp>
+#include <boost/capy/ex/this_coro.hpp>
 #include <boost/capy/test/stream.hpp>
 #include <boost/capy/test/run_blocking.hpp>
 #include <chrono>
@@ -25,12 +26,12 @@ Using stop tokens to implement operation timeouts.
 #include <latch>
 #include <thread>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // A slow operation that respects cancellation
-task<std::string> slow_fetch(int steps)
+capy::task<std::string> slow_fetch(int steps)
 {
-    auto token = co_await this_coro::stop_token;  // std::stop_token
+    auto token = co_await capy::this_coro::stop_token;  // std::stop_token
     std::string result;
     
     for (int i = 0; i < steps; ++i)
@@ -50,6 +51,7 @@ task<std::string> slow_fetch(int steps)
         std::cout << "  Completed step " << i << std::endl;
         
         // Yield to allow stop request to be processed before next check
+        // Extra 5ms ensures print completes before main thread prints
         std::this_thread::sleep_for(std::chrono::milliseconds(15));
     }
     
@@ -57,9 +59,9 @@ task<std::string> slow_fetch(int steps)
 }
 
 // Run with timeout (conceptual - real implementation needs timer)
-task<std::optional<std::string>> fetch_with_timeout()
+capy::task<std::optional<std::string>> fetch_with_timeout()
 {
-    auto token = co_await this_coro::stop_token;  // std::stop_token
+    auto token = co_await capy::this_coro::stop_token;  // std::stop_token
     
     try
     {
@@ -78,11 +80,11 @@ void demo_normal_completion()
 {
     std::cout << "Demo: Normal completion\n";
     
-    thread_pool pool;
+    capy::thread_pool pool;
     std::stop_source source;
     std::latch done(1);  // std::latch - wait for 1 task
-    
-    run_async(pool.get_executor(), source.get_token(),
+
+    capy::run_async(pool.get_executor(), source.get_token(),
         [&done](std::optional<std::string> result) {
             if (result)
                 std::cout << "Result: " << *result << "\n";
@@ -92,20 +94,20 @@ void demo_normal_completion()
         },
         [&done](std::exception_ptr) { done.count_down(); }
     )(fetch_with_timeout());
-    
+
     done.wait();  // Block until task completes
 }
 
 void demo_cancellation()
 {
     std::cout << "\nDemo: Cancellation after 2 steps\n";
-    
-    thread_pool pool;
+
+    capy::thread_pool pool;
     std::stop_source source;
     std::latch done(1);  // std::latch - wait for 1 task
-    
+
     // Launch the task
-    run_async(pool.get_executor(), source.get_token(),
+    capy::run_async(pool.get_executor(), source.get_token(),
         [&done](std::optional<std::string> result) {
             if (result)
                 std::cout << "Result: " << *result << "\n";
@@ -118,6 +120,7 @@ void demo_cancellation()
     
     // Simulate timeout: cancel after 2 steps complete
     // Timing: each step is 10ms work + 15ms yield = 25ms total
+    // Step 1 prints at 35ms, step 2 check at 50ms
     // Stop at 42ms: after step 1 print, before step 2 check
     std::this_thread::sleep_for(std::chrono::milliseconds(42));
     std::cout << "  Requesting stop..." << std::endl;
@@ -127,9 +130,9 @@ void demo_cancellation()
 }
 
 // Example: Manual stop token checking
-task<int> process_items(std::vector<int> const& items)
+capy::task<int> process_items(std::vector<int> const& items)
 {
-    auto token = co_await this_coro::stop_token;  // std::stop_token
+    auto token = co_await capy::this_coro::stop_token;  // std::stop_token
     int sum = 0;
     
     for (auto item : items)  // int
@@ -169,7 +172,7 @@ target_link_libraries(timeout_cancellation PRIVATE capy)
 
 [source,cpp]
 ----
-auto token = co_await this_coro::stop_token;  // std::stop_token
+auto token = co_await capy::this_coro::stop_token;  // std::stop_token
 ----
 
 Inside a task, `this_coro::stop_token` retrieves the stop token propagated from the caller. You can also access it through the full environment via `co_await this_coro::environment`.
@@ -191,7 +194,7 @@ Check `stop_requested()` at appropriate points—typically before expensive oper
 [source,cpp]
 ----
 std::stop_source source;
-run_async(ex, source.get_token())(my_task());
+capy::run_async(ex, source.get_token())(my_task());
 
 // Later:
 source.request_stop();
@@ -238,4 +241,4 @@ Cancelled (returned nullopt)
 
 == Next Steps
 
-* xref:7.examples/7g.parallel-fetch.adoc[Parallel Fetch] — Concurrent operations with when_all
+* xref:8.examples/8g.parallel-fetch.adoc[Parallel Fetch] — Concurrent operations with when_all
diff --git a/doc/modules/ROOT/pages/7.examples/7g.parallel-fetch.adoc b/doc/modules/ROOT/pages/8.examples/8g.parallel-fetch.adoc
similarity index 57%
rename from doc/modules/ROOT/pages/7.examples/7g.parallel-fetch.adoc
rename to doc/modules/ROOT/pages/8.examples/8g.parallel-fetch.adoc
index db92a0729..9efdc0492 100644
--- a/doc/modules/ROOT/pages/7.examples/7g.parallel-fetch.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8g.parallel-fetch.adoc
@@ -10,7 +10,7 @@ Running multiple operations concurrently with `when_all`.
 
 == Prerequisites
 
-* Completed xref:7.examples/7f.timeout-cancellation.adoc[Timeout with Cancellation]
+* Completed xref:8.examples/8f.timeout-cancellation.adoc[Timeout with Cancellation]
 * Understanding of `when_all` from xref:../4.coroutines/4f.composition.adoc[Composition]
 
 == Source Code
@@ -21,53 +21,59 @@ Running multiple operations concurrently with `when_all`.
 #include <iostream>
 #include <latch>
 #include <string>
+#include <vector>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // Simulated async operations
-task<int> fetch_user_id(std::string username)
+capy::task<int> fetch_user_id(std::string username)
 {
     std::cout << "Fetching user ID for: " << username << "\n";
     // In real code: co_await http_get("/users/" + username);
     co_return static_cast<int>(username.length()) * 100;  // Fake ID
 }
 
-task<std::string> fetch_user_name(int id)
+capy::task<std::string> fetch_user_name(int id)
 {
     std::cout << "Fetching name for user ID: " << id << "\n";
     co_return "User" + std::to_string(id);
 }
 
-task<int> fetch_order_count(int user_id)
+capy::task<int> fetch_order_count(int user_id)
 {
     std::cout << "Fetching order count for user: " << user_id << "\n";
     co_return user_id / 10;  // Fake count
 }
 
-task<double> fetch_account_balance(int user_id)
+capy::task<double> fetch_account_balance(int user_id)
 {
     std::cout << "Fetching balance for user: " << user_id << "\n";
     co_return user_id * 1.5;  // Fake balance
 }
 
-// Fetch all user data in parallel
-task<> fetch_user_dashboard(std::string username)
+// Fetch all user data in parallel using variadic when_all.
+// Heterogeneous return types are flattened into the result.
+capy::task<> fetch_user_dashboard(std::string username)
 {
     std::cout << "\n=== Fetching dashboard for: " << username << " ===\n";
-    
+
     // First, get the user ID (needed for other queries)
     int user_id = co_await fetch_user_id(username);
     std::cout << "Got user ID: " << user_id << "\n\n";
-    
-    // Now fetch all user data in parallel
+
+    // when_all requires io_task children. Wrap plain tasks:
     std::cout << "Starting parallel fetches...\n";
-    // name: std::string, orders: int, balance: double
-    auto [name, orders, balance] = co_await when_all(
-        fetch_user_name(user_id),
-        fetch_order_count(user_id),
-        fetch_account_balance(user_id)
-    );
-    
+
+    auto wrap = [](auto inner) -> capy::io_task<decltype(inner.await_resume())> {
+        co_return capy::io_result<decltype(inner.await_resume())>{
+            {}, co_await std::move(inner)};
+    };
+
+    auto [ec, name, orders, balance] = co_await capy::when_all(
+        wrap(fetch_user_name(user_id)),
+        wrap(fetch_order_count(user_id)),
+        wrap(fetch_account_balance(user_id)));
+
     std::cout << "\nDashboard results:\n";
     std::cout << "  Name: " << name << "\n";
     std::cout << "  Orders: " << orders << "\n";
@@ -75,61 +81,60 @@ task<> fetch_user_dashboard(std::string username)
 }
 
 // Example with void tasks
-task<> log_access(std::string resource)
+capy::io_task<> log_access(std::string resource)
 {
     std::cout << "Logging access to: " << resource << "\n";
-    co_return;
+    co_return capy::io_result<>{};
 }
 
-task<> update_metrics(std::string metric)
+capy::io_task<> update_metrics(std::string metric)
 {
     std::cout << "Updating metric: " << metric << "\n";
-    co_return;
+    co_return capy::io_result<>{};
 }
 
-task<std::string> fetch_with_side_effects()
+capy::task<std::string> fetch_with_side_effects()
 {
     std::cout << "\n=== Fetch with side effects ===\n";
-    
-    // void tasks don't contribute to result tuple
-    std::tuple<std::string> results = co_await when_all(
-        log_access("api/data"),           // void - no result
-        update_metrics("api_calls"),      // void - no result
-        fetch_user_name(42)               // returns string
-    );
-    std::string data = std::get<0>(results);  // std::string
-    
+
+    auto r = co_await capy::when_all(
+        log_access("api/data"),
+        update_metrics("api_calls"));
+    if (r.ec)
+        co_return "error";
+
+    auto data = co_await fetch_user_name(42);
+
     std::cout << "Data: " << data << "\n";
     co_return data;
 }
 
 // Error handling example
-task<int> might_fail(bool should_fail, std::string name)
+capy::io_task<int> might_fail(bool should_fail, std::string name)
 {
     std::cout << "Task " << name << " starting\n";
-    
+
     if (should_fail)
     {
         throw std::runtime_error(name + " failed!");
     }
-    
+
     std::cout << "Task " << name << " completed\n";
-    co_return 42;
+    co_return capy::io_result<int>{{}, 42};
 }
 
-task<> demonstrate_error_handling()
+capy::task<> demonstrate_error_handling()
 {
     std::cout << "\n=== Error handling ===\n";
-    
+
     try
     {
-        // a: int, b: int, c: int
-        auto [a, b, c] = co_await when_all(
+        auto [ec2, a, b, c] = co_await capy::when_all(
             might_fail(false, "A"),
             might_fail(true, "B"),   // This one fails
-            might_fail(false, "C")
-        );
-        std::cout << "All succeeded: " << a << ", " << b << ", " << c << "\n";
+            might_fail(false, "C"));
+        std::cout << "All succeeded: " << a << ", "
+                  << b << ", " << c << "\n";
     }
     catch (std::runtime_error const& e)
     {
@@ -141,19 +146,18 @@ task<> demonstrate_error_handling()
 
 int main()
 {
-    thread_pool pool;
-    
+    capy::thread_pool pool;
     std::latch done(3);  // std::latch - wait for 3 tasks
-    
+
     // Completion handlers signal the latch when each task finishes
     // Use generic lambda to accept any result type (or no result for task<void>)
     auto on_complete = [&done](auto&&...) { done.count_down(); };
     auto on_error = [&done](std::exception_ptr) { done.count_down(); };
-    
-    run_async(pool.get_executor(), on_complete, on_error)(fetch_user_dashboard("alice"));
-    run_async(pool.get_executor(), on_complete, on_error)(fetch_with_side_effects());
-    run_async(pool.get_executor(), on_complete, on_error)(demonstrate_error_handling());
-    
+
+    capy::run_async(pool.get_executor(), on_complete, on_error)(fetch_user_dashboard("alice"));
+    capy::run_async(pool.get_executor(), on_complete, on_error)(fetch_with_side_effects());
+    capy::run_async(pool.get_executor(), on_complete, on_error)(demonstrate_error_handling());
+
     done.wait();  // Block until all tasks complete
     return 0;
 }
@@ -173,28 +177,26 @@ target_link_libraries(parallel_fetch PRIVATE capy)
 
 [source,cpp]
 ----
-auto [name, orders, balance] = co_await when_all(
-    fetch_user_name(user_id),
-    fetch_order_count(user_id),
-    fetch_account_balance(user_id)
-);
+auto [ec, name, orders, balance] = co_await capy::when_all(
+    wrap(fetch_user_name(user_id)),
+    wrap(fetch_order_count(user_id)),
+    wrap(fetch_account_balance(user_id)));
 ----
 
-All three tasks run concurrently. `when_all` completes when all tasks finish. Results are returned in a tuple matching input order.
+`when_all` requires children returning `io_result`, so plain tasks are wrapped. All three run concurrently. The result is `io_result<std::string, int, double>`, a single `ec` plus the flattened payloads in input order.
 
-=== Void Filtering
+=== Void io_tasks
 
 [source,cpp]
 ----
-std::tuple<std::string> results = co_await when_all(
-    log_access("api/data"),      // void - filtered out
-    update_metrics("api_calls"), // void - filtered out
-    fetch_user_name(42)          // string - in tuple
-);
-std::string data = std::get<0>(results);  // std::string
+auto r = co_await capy::when_all(
+    log_access("api/data"),
+    update_metrics("api_calls"));
+if (r.ec)
+    co_return "error";
 ----
 
-Tasks returning `void` don't contribute to the result tuple. Only non-void results appear.
+`io_task<>` children return `io_result<>` (just an error code, no payload). Check `r.ec` to detect failure.
 
 === Error Propagation
 
@@ -202,7 +204,10 @@ Tasks returning `void` don't contribute to the result tuple. Only non-void resul
 ----
 try
 {
-    auto results = co_await when_all(task_a(), task_b(), task_c());
+    auto [ec2, a, b, c] = co_await capy::when_all(
+        might_fail(false, "A"),
+        might_fail(true, "B"),
+        might_fail(false, "C"));
 }
 catch (...)
 {
@@ -211,12 +216,7 @@ catch (...)
 }
 ----
 
-When a task throws:
-
-1. The exception is captured
-2. Stop is requested for siblings
-3. All tasks complete (or respond to stop)
-4. First exception is rethrown
+I/O errors are reported via `ec` in the `io_result`. Thrown exceptions are captured separately — Upon error cancellation is requested and the first exception is rethrown after all tasks complete.
 
 == Output
 
@@ -258,4 +258,4 @@ Caught error: B failed!
 
 == Next Steps
 
-* xref:7.examples/7h.custom-dynamic-buffer.adoc[Custom Dynamic Buffer] — Implementing your own buffer
+* xref:8.examples/8h.custom-dynamic-buffer.adoc[Custom Dynamic Buffer] — Implementing your own buffer
diff --git a/doc/modules/ROOT/pages/7.examples/7h.custom-dynamic-buffer.adoc b/doc/modules/ROOT/pages/8.examples/8h.custom-dynamic-buffer.adoc
similarity index 87%
rename from doc/modules/ROOT/pages/7.examples/7h.custom-dynamic-buffer.adoc
rename to doc/modules/ROOT/pages/8.examples/8h.custom-dynamic-buffer.adoc
index 10bc0a0df..fd628da6a 100644
--- a/doc/modules/ROOT/pages/7.examples/7h.custom-dynamic-buffer.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8h.custom-dynamic-buffer.adoc
@@ -10,7 +10,7 @@ Implementing the DynamicBuffer concept for a custom allocation strategy.
 
 == Prerequisites
 
-* Completed xref:7.examples/7g.parallel-fetch.adoc[Parallel Fetch]
+* Completed xref:8.examples/8g.parallel-fetch.adoc[Parallel Fetch]
 * Understanding of dynamic buffers from xref:../5.buffers/5f.dynamic.adoc[Dynamic Buffers]
 
 == Source Code
@@ -27,7 +27,7 @@ Implementing the DynamicBuffer concept for a custom allocation strategy.
 #include <cassert>
 #include <cstring>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // Custom dynamic buffer with statistics tracking
 class tracked_buffer
@@ -52,9 +52,9 @@ public:
     // === DynamicBuffer interface ===
     
     // Consumer: readable data
-    const_buffer data() const noexcept
+    capy::const_buffer data() const noexcept
     {
-        return const_buffer(
+        return capy::const_buffer(
             storage_.data() + read_pos_,
             write_pos_ - read_pos_);
     }
@@ -76,7 +76,7 @@ public:
     }
     
     // Producer: prepare space for writing
-    mutable_buffer prepare(std::size_t n)
+    capy::mutable_buffer prepare(std::size_t n)
     {
         total_prepared_ += n;
         
@@ -94,7 +94,7 @@ public:
         if (required > storage_.size())
             storage_.resize(required);
         
-        return mutable_buffer(
+        return capy::mutable_buffer(
             storage_.data() + write_pos_,
             n);
     }
@@ -147,7 +147,7 @@ private:
 };
 
 // Demonstrate using the custom buffer
-task<> read_into_tracked_buffer(test::stream& stream, tracked_buffer& buffer)
+capy::task<> read_into_tracked_buffer(capy::test::stream& stream, tracked_buffer& buffer)
 {
     // Read data in chunks
     while (true)
@@ -155,17 +155,15 @@ task<> read_into_tracked_buffer(test::stream& stream, tracked_buffer& buffer)
         auto space = buffer.prepare(256);  // mutable_buffer
         // ec: std::error_code, n: std::size_t
         auto [ec, n] = co_await stream.read_some(space);
-        
-        if (ec == cond::eof)
-            break;
-        
-        if (ec)
-            throw std::system_error(ec);
-        
+
         buffer.commit(n);
-        
-        std::cout << "Read " << n << " bytes, buffer size now: " 
-                  << buffer.size() << "\n";
+
+        if (n > 0)
+            std::cout << "Read " << n << " bytes, buffer size now: "
+                      << buffer.size() << "\n";
+
+        if (ec)
+            break;
     }
 }
 
@@ -173,17 +171,15 @@ void demo_tracked_buffer()
 {
     std::cout << "=== Tracked Buffer Demo ===\n\n";
     
-    // Setup mock stream with test data
-    test::fuse f;
-    test::stream mock(f);
-    mock.provide("Hello, ");
-    mock.provide("World! ");
-    mock.provide("This is a test of the custom buffer.\n");
-    // Stream returns eof when data is exhausted
+    auto [reader, writer] = capy::test::make_stream_pair();
+    writer.provide("Hello, ");
+    writer.provide("World! ");
+    writer.provide("This is a test of the custom buffer.\n");
+    writer.close();
     
     tracked_buffer buffer;
     
-    test::run_blocking()(read_into_tracked_buffer(mock, buffer));
+    capy::test::run_blocking()(read_into_tracked_buffer(reader, buffer));
     
     std::cout << "\nFinal buffer contents: ";
     auto data = buffer.data();  // const_buffer
@@ -300,4 +296,4 @@ Buffer statistics:
 
 == Next Steps
 
-* xref:7.examples/7i.echo-server-corosio.adoc[Echo Server with Corosio] — Real networking
+* xref:8.examples/8i.echo-server-corosio.adoc[Echo Server with Corosio] — Real networking
diff --git a/doc/modules/ROOT/pages/8.examples/8i.echo-server-corosio.adoc b/doc/modules/ROOT/pages/8.examples/8i.echo-server-corosio.adoc
new file mode 100644
index 000000000..e4c073d27
--- /dev/null
+++ b/doc/modules/ROOT/pages/8.examples/8i.echo-server-corosio.adoc
@@ -0,0 +1,191 @@
+= Echo Server with Corosio
+
+A complete echo server using Corosio for real network I/O.
+
+== What You Will Learn
+
+* Integrating Capy with Corosio networking
+* Accepting TCP connections with `tcp_acceptor`
+* Handling multiple clients concurrently
+
+== Prerequisites
+
+* Completed xref:8.examples/8h.custom-dynamic-buffer.adoc[Custom Dynamic Buffer]
+* Corosio library installed
+* Understanding of TCP networking basics
+
+== Source Code
+
+[source,cpp]
+----
+#include <boost/capy.hpp>
+#include <boost/corosio.hpp>
+#include <iostream>
+
+namespace corosio = boost::corosio;
+namespace capy = boost::capy;
+
+capy::task<> echo_session(corosio::tcp_socket sock)
+{
+    char buf[1024];
+
+    for (;;)
+    {
+        auto [ec, n] = co_await sock.read_some(
+            capy::mutable_buffer(buf, sizeof(buf)));
+
+        auto [wec, wn] = co_await capy::write(
+            sock, capy::const_buffer(buf, n));
+
+        if (ec)
+            break;
+
+        if (wec)
+            break;
+    }
+
+    sock.close();
+}
+
+capy::task<> accept_loop(
+    corosio::tcp_acceptor& acc,
+    corosio::io_context& ioc)
+{
+    auto ep = acc.local_endpoint();
+    std::cout << "Listening on port " << ep.port() << "\n";
+
+    for (;;)
+    {
+        corosio::tcp_socket peer(ioc);
+        auto [ec] = co_await acc.accept(peer);
+
+        if (ec)
+        {
+            std::cout << "Accept error: " << ec.message() << "\n";
+            continue;
+        }
+
+        auto remote = peer.remote_endpoint();
+        std::cout << "Connection from ";
+        if (remote.is_v4())
+            std::cout << remote.v4_address();
+        else
+            std::cout << remote.v6_address();
+        std::cout << ":" << remote.port() << "\n";
+
+        capy::run_async(ioc.get_executor())(
+            echo_session(std::move(peer)));
+    }
+}
+
+int main(int argc, char* argv[])
+{
+    unsigned short port = 8080;
+    if (argc > 1)
+        port = static_cast<unsigned short>(std::atoi(argv[1]));
+
+    corosio::io_context ioc;
+    corosio::tcp_acceptor acc(ioc, corosio::endpoint(port));
+
+    capy::run_async(ioc.get_executor())(
+        accept_loop(acc, ioc));
+
+    ioc.run();
+
+    return 0;
+}
+----
+
+== Build
+
+[source,cmake]
+----
+add_executable(echo_server echo_server.cpp)
+target_link_libraries(echo_server PRIVATE Boost::capy Boost::corosio)
+----
+
+== Walkthrough
+
+=== TCP Acceptor
+
+[source,cpp]
+----
+corosio::io_context ioc;
+corosio::tcp_acceptor acc(ioc, corosio::endpoint(port));
+----
+
+The `io_context` drives all asynchronous I/O. The `tcp_acceptor` listens on the specified port. Corosio uses a flat namespace -- types like `tcp_socket`, `tcp_acceptor`, and `endpoint` live directly in `boost::corosio`.
+
+=== Accept Loop
+
+[source,cpp]
+----
+for (;;)
+{
+    corosio::tcp_socket peer(ioc);
+    auto [ec] = co_await acc.accept(peer);
+    // ... handle connection ...
+}
+----
+
+The accept loop runs forever, creating a new `tcp_socket` for each connection. `acc.accept(peer)` suspends the coroutine until a client connects.
+
+=== Echo Session
+
+[source,cpp]
+----
+auto [ec, n] = co_await sock.read_some(
+    capy::mutable_buffer(buf, sizeof(buf)));
+// ...
+auto [wec, wn] = co_await capy::write(
+    sock, capy::const_buffer(buf, n));
+----
+
+Each session reads data with `read_some` and writes it back with `write`. When the client disconnects, `read_some` returns an error and the loop exits.
+
+=== Concurrent Clients
+
+[source,cpp]
+----
+capy::run_async(ioc.get_executor())(
+    echo_session(std::move(peer)));
+----
+
+Each accepted connection moves the socket into a new task via `run_async`. The coroutine owns the socket for the lifetime of the session. Multiple clients are handled concurrently on the same `io_context`.
+
+== Testing
+
+Start the server:
+
+----
+$ ./echo_server 8080
+Listening on port 8080
+----
+
+Connect with netcat:
+
+----
+$ nc localhost 8080
+Hello
+Hello
+World
+World
+^C
+----
+
+Server output:
+
+----
+Listening on port 8080
+Connection from 127.0.0.1:54321
+----
+
+== Exercises
+
+1. Add a connection limit with graceful rejection
+2. Implement a simple command protocol (e.g., ECHO, QUIT, STATS)
+3. Add TLS support using Corosio's TLS streams
+
+== Next Steps
+
+* xref:8.examples/8j.stream-pipeline.adoc[Stream Pipeline] -- Data transformation chains
diff --git a/doc/modules/ROOT/pages/7.examples/7j.stream-pipeline.adoc b/doc/modules/ROOT/pages/8.examples/8j.stream-pipeline.adoc
similarity index 54%
rename from doc/modules/ROOT/pages/7.examples/7j.stream-pipeline.adoc
rename to doc/modules/ROOT/pages/8.examples/8j.stream-pipeline.adoc
index 6f7453c0b..6256f307e 100644
--- a/doc/modules/ROOT/pages/7.examples/7j.stream-pipeline.adoc
+++ b/doc/modules/ROOT/pages/8.examples/8j.stream-pipeline.adoc
@@ -10,25 +10,13 @@ Data transformation through a pipeline of sources and sinks.
 
 == Prerequisites
 
-* Completed xref:7.examples/7i.echo-server-corosio.adoc[Echo Server with Corosio]
+* Completed xref:8.examples/8i.echo-server-corosio.adoc[Echo Server with Corosio]
 * Understanding of buffer sources/sinks from xref:../6.streams/6d.buffer-concepts.adoc[Buffer Concepts]
 
 == Source Code
 
 [source,cpp]
 ----
-//
-// Stream Pipeline Example
-//
-// This example demonstrates chaining buffer sources to create a data
-// processing pipeline. Data flows through transform stages:
-//
-//   input -> uppercase_transform -> line_numbering_transform -> output
-//
-// Each transform is a BufferSource that wraps an upstream any_buffer_source,
-// enabling type-erased composition of arbitrary transform chains.
-//
-
 #include <boost/capy.hpp>
 #include <boost/capy/test/run_blocking.hpp>
 #include <boost/capy/test/buffer_source.hpp>
@@ -40,26 +28,36 @@ Data transformation through a pipeline of sources and sinks.
 #include <cctype>
 #include <system_error>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
+
+//------------------------------------------------------------------------------
+//
+// Transform: uppercase_transform
+//
+// A BufferSource that pulls from an upstream source and converts all
+// characters to uppercase. Demonstrates a simple byte-by-byte transform.
+//
+//------------------------------------------------------------------------------
 
-// A transform stage that converts to uppercase
 class uppercase_transform
 {
-    any_buffer_source* source_;  // any_buffer_source*
-    std::vector<char> buffer_;   // std::vector<char>
-    std::size_t consumed_ = 0;   // std::size_t
-    bool exhausted_ = false;     // bool
-    
+    capy::any_buffer_source* source_;  // any_buffer_source*
+    std::vector<char> buffer_;   // std::vector<char> - transformed data
+    std::size_t consumed_ = 0;   // std::size_t - bytes consumed by downstream
+    bool exhausted_ = false;     // bool - upstream exhausted
+
 public:
-    explicit uppercase_transform(any_buffer_source& source)
+    explicit uppercase_transform(capy::any_buffer_source& source)
         : source_(&source)
     {
     }
     
     // BufferSource::consume - advance past processed bytes
-    void consume(std::size_t n) noexcept
+    void
+    consume(std::size_t n) noexcept
     {
         consumed_ += n;
+        // Compact buffer when fully consumed
         if (consumed_ >= buffer_.size())
         {
             buffer_.clear();
@@ -68,86 +66,97 @@ public:
     }
     
     // BufferSource::pull - returns task<> to enable co_await on upstream
-    io_task<std::span<const_buffer>>
-    pull(std::span<const_buffer> dest)
+    capy::io_task<std::span<capy::const_buffer>>
+    pull(std::span<capy::const_buffer> dest)
     {
         // Already have unconsumed data?
         if (consumed_ < buffer_.size())
         {
             if (dest.empty())
-                co_return {std::error_code{}, std::span<const_buffer>{}};
-            
-            dest[0] = const_buffer(
+                co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+            dest[0] = capy::const_buffer(
                 buffer_.data() + consumed_,
                 buffer_.size() - consumed_);
             co_return {std::error_code{}, dest.first(1)};
         }
-        
+
         // Upstream exhausted?
         if (exhausted_)
-            co_return {std::error_code{}, std::span<const_buffer>{}};
-        
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
+
         // Pull from upstream
         buffer_.clear();
         consumed_ = 0;
-        
-        const_buffer upstream[8];  // const_buffer[8]
+
+        capy::const_buffer upstream[8];  // const_buffer[8]
         // ec: std::error_code, bufs: std::span<const_buffer>
         auto [ec, bufs] = co_await source_->pull(upstream);
-        
-        if (ec)
-            co_return {ec, std::span<const_buffer>{}};
-        
-        if (bufs.empty())
+
+        if (ec == capy::cond::eof)
         {
             exhausted_ = true;
-            co_return {std::error_code{}, std::span<const_buffer>{}};
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
         }
-        
+
+        if (ec)
+            co_return {ec, std::span<capy::const_buffer>{}};
+
         // Transform: uppercase each byte
         for (auto const& buf : bufs)  // const_buffer const&
         {
             auto const* data = static_cast<char const*>(buf.data());  // char const*
             auto size = buf.size();  // std::size_t
-            
+
             for (std::size_t i = 0; i < size; ++i)
             {
                 buffer_.push_back(static_cast<char>(
                     std::toupper(static_cast<unsigned char>(data[i]))));
             }
         }
-        
+
         // Consume from upstream
-        source_->consume(buffer_size(bufs));
-        
+        source_->consume(capy::buffer_size(bufs));
+
         // Return transformed data
         if (dest.empty() || buffer_.empty())
-            co_return {std::error_code{}, std::span<const_buffer>{}};
-        
-        dest[0] = const_buffer(buffer_.data(), buffer_.size());
+            co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+        dest[0] = capy::const_buffer(buffer_.data(), buffer_.size());
         co_return {std::error_code{}, dest.first(1)};
     }
 };
 
-// A transform that adds line numbers
+//------------------------------------------------------------------------------
+//
+// Transform: line_numbering_transform
+//
+// A BufferSource that pulls from an upstream source and prepends line
+// numbers to each line. Demonstrates a transform that changes data size.
+//
+//------------------------------------------------------------------------------
+
 class line_numbering_transform
 {
-    any_buffer_source* source_;  // any_buffer_source*
-    std::string buffer_;         // std::string
-    std::size_t consumed_ = 0;   // std::size_t
-    std::size_t line_num_ = 1;   // std::size_t
-    bool at_line_start_ = true;  // bool
-    bool exhausted_ = false;     // bool
-    
+    capy::any_buffer_source* source_;  // any_buffer_source*
+    std::string buffer_;         // std::string - transformed data
+    std::size_t consumed_ = 0;   // std::size_t - bytes consumed by downstream
+    std::size_t line_num_ = 1;   // std::size_t - current line number
+    bool at_line_start_ = true;  // bool - are we at start of a line?
+    bool exhausted_ = false;     // bool - upstream exhausted
+
 public:
-    explicit line_numbering_transform(any_buffer_source& source)
+    explicit line_numbering_transform(capy::any_buffer_source& source)
         : source_(&source)
     {
     }
     
-    void consume(std::size_t n) noexcept
+    // BufferSource::consume - advance past processed bytes
+    void
+    consume(std::size_t n) noexcept
     {
         consumed_ += n;
+        // Compact buffer when fully consumed
         if (consumed_ >= buffer_.size())
         {
             buffer_.clear();
@@ -155,45 +164,49 @@ public:
         }
     }
     
-    io_task<std::span<const_buffer>>
-    pull(std::span<const_buffer> dest)
+    // BufferSource::pull - returns task<> to enable co_await on upstream
+    capy::io_task<std::span<capy::const_buffer>>
+    pull(std::span<capy::const_buffer> dest)
     {
+        // Already have unconsumed data?
         if (consumed_ < buffer_.size())
         {
             if (dest.empty())
-                co_return {std::error_code{}, std::span<const_buffer>{}};
-            
-            dest[0] = const_buffer(
+                co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+            dest[0] = capy::const_buffer(
                 buffer_.data() + consumed_,
                 buffer_.size() - consumed_);
             co_return {std::error_code{}, dest.first(1)};
         }
-        
+
+        // Upstream exhausted?
         if (exhausted_)
-            co_return {std::error_code{}, std::span<const_buffer>{}};
-        
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
+
+        // Pull from upstream
         buffer_.clear();
         consumed_ = 0;
-        
-        const_buffer upstream[8];  // const_buffer[8]
+
+        capy::const_buffer upstream[8];  // const_buffer[8]
         // ec: std::error_code, bufs: std::span<const_buffer>
         auto [ec, bufs] = co_await source_->pull(upstream);
-        
-        if (ec)
-            co_return {ec, std::span<const_buffer>{}};
-        
-        if (bufs.empty())
+
+        if (ec == capy::cond::eof)
         {
             exhausted_ = true;
-            co_return {std::error_code{}, std::span<const_buffer>{}};
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
         }
-        
+
+        if (ec)
+            co_return {ec, std::span<capy::const_buffer>{}};
+
         // Transform: add line numbers
         for (auto const& buf : bufs)  // const_buffer const&
         {
             auto const* data = static_cast<char const*>(buf.data());  // char const*
             auto size = buf.size();  // std::size_t
-            
+
             for (std::size_t i = 0; i < size; ++i)
             {
                 if (at_line_start_)
@@ -206,34 +219,42 @@ public:
                     at_line_start_ = true;
             }
         }
-        
-        source_->consume(buffer_size(bufs));
-        
+
+        // Consume from upstream
+        source_->consume(capy::buffer_size(bufs));
+
+        // Return transformed data
         if (dest.empty() || buffer_.empty())
-            co_return {std::error_code{}, std::span<const_buffer>{}};
-        
-        dest[0] = const_buffer(buffer_.data(), buffer_.size());
+            co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+        dest[0] = capy::const_buffer(buffer_.data(), buffer_.size());
         co_return {std::error_code{}, dest.first(1)};
     }
 };
 
-// Transfer from source to sink
-task<std::size_t> transfer(any_buffer_source& source, any_write_sink& sink)
+//------------------------------------------------------------------------------
+//
+// transfer: Pull from source and write to sink until exhausted
+//
+//------------------------------------------------------------------------------
+
+capy::task<std::size_t> transfer(capy::any_buffer_source& source, capy::any_write_sink& sink)
 {
     std::size_t total = 0;  // std::size_t
-    const_buffer bufs[8];   // const_buffer[8]
-    
+    capy::const_buffer bufs[8];   // const_buffer[8]
+
     for (;;)
     {
         // ec: std::error_code, spans: std::span<const_buffer>
         auto [ec, spans] = co_await source.pull(bufs);
-        
+
+        if (ec == capy::cond::eof)
+            break;
+
         if (ec)
             throw std::system_error(ec);
-        
-        if (spans.empty())
-            break;
-        
+
+        // Write each buffer to sink
         for (auto const& buf : spans)  // const_buffer const&
         {
             // wec: std::error_code, n: std::size_t
@@ -242,52 +263,72 @@ task<std::size_t> transfer(any_buffer_source& source, any_write_sink& sink)
                 throw std::system_error(wec);
             total += n;
         }
-        
-        source.consume(buffer_size(spans));
+
+        // Consume what we read
+        source.consume(capy::buffer_size(spans));
     }
-    
-    io_result<> eof_result = co_await sink.write_eof();
+
+    capy::io_result<> eof_result = co_await sink.write_eof();
     if (eof_result.ec)
         throw std::system_error(eof_result.ec);
-    
+
     co_return total;
 }
 
+//------------------------------------------------------------------------------
+//
+// demo_pipeline: Demonstrate chained transforms
+//
+//------------------------------------------------------------------------------
+
 void demo_pipeline()
 {
     std::cout << "=== Stream Pipeline Demo ===\n\n";
     
-    // Input data
+    // Input data - three lines
     std::string input = "hello world\nthis is a test\nof the pipeline\n";
     std::cout << "Input:\n" << input << "\n";
     
     // Create mock source with input data
-    test::fuse f;  // test::fuse
-    test::buffer_source source(f);  // test::buffer_source
+    capy::test::fuse f;  // test::fuse
+    capy::test::buffer_source source(f);  // test::buffer_source
     source.provide(input);
     
-    // Build the pipeline using type-erased buffer sources.
-    // Using pointer construction (&source) for reference semantics -
-    // the wrapper does not take ownership, so source must outlive src.
-    any_buffer_source src{&source};  // any_buffer_source
+    // Build the pipeline using type-erased buffer sources:
+    //   source -> [uppercase] -> [line_numbering] -> sink
+    
+    // Stage 1: Wrap raw source as any_buffer_source.
+    // Using pointer construction (&source) for reference semantics - the
+    // wrapper does not take ownership, so source must outlive src.
+    capy::any_buffer_source src{&source};  // any_buffer_source
     
+    // Stage 2: Uppercase transform wraps src.
+    // Again using pointer construction so upper_src references upper
+    // without taking ownership.
     uppercase_transform upper{src};  // uppercase_transform
-    any_buffer_source upper_src{&upper};  // any_buffer_source
+    capy::any_buffer_source upper_src{&upper};  // any_buffer_source
     
+    // Stage 3: Line numbering transform wraps upper_src.
     line_numbering_transform numbered{upper_src};  // line_numbering_transform
-    any_buffer_source numbered_src{&numbered};  // any_buffer_source
+    capy::any_buffer_source numbered_src{&numbered};  // any_buffer_source
     
-    // Create sink - pointer construction ensures sink outlives dst
-    test::write_sink sink(f);  // test::write_sink
-    any_write_sink dst{&sink};  // any_write_sink
+    // Create sink to collect output.
+    // Pointer construction ensures sink outlives dst.
+    capy::test::write_sink sink(f);  // test::write_sink
+    capy::any_write_sink dst{&sink};  // any_write_sink
     
-    // Run pipeline
+    // Run the pipeline
     std::size_t bytes = 0;  // std::size_t
-    test::run_blocking([&](std::size_t n) { bytes = n; })(
+    capy::test::run_blocking([&](std::size_t n) { bytes = n; })(
         transfer(numbered_src, dst));
     
     std::cout << "Output (" << bytes << " bytes):\n";
     std::cout << sink.data() << "\n";
+    
+    // Expected output:
+    // 1: HELLO WORLD
+    // 2: THIS IS A TEST
+    // 3: OF THE PIPELINE
 }
 
 int main()
@@ -332,20 +373,20 @@ Data flows through the pipeline:
 
 [source,cpp]
 ----
-io_task<std::span<const_buffer>>
-pull(std::span<const_buffer> dest)
+capy::io_task<std::span<capy::const_buffer>>
+pull(std::span<capy::const_buffer> dest)
 {
     // Pull from upstream
     // ec: std::error_code, bufs: std::span<const_buffer>
     auto [ec, bufs] = co_await source_->pull(upstream);
-    
+
     // Transform data...
-    
+
     // Consume from upstream
-    source_->consume(buffer_size(bufs));
-    
+    source_->consume(capy::buffer_size(bufs));
+
     // Return transformed buffer
-    dest[0] = const_buffer(buffer_.data(), buffer_.size());
+    dest[0] = capy::const_buffer(buffer_.data(), buffer_.size());
     co_return {std::error_code{}, dest.first(1)};
 }
 ----
@@ -362,10 +403,10 @@ Each stage:
 [source,cpp]
 ----
 // Using pointer construction (&source) for reference semantics
-any_buffer_source src{&source};  // any_buffer_source
+capy::any_buffer_source src{&source};  // any_buffer_source
 
 uppercase_transform upper{src};  // uppercase_transform
-any_buffer_source upper_src{&upper};  // any_buffer_source
+capy::any_buffer_source upper_src{&upper};  // any_buffer_source
 ----
 
 `any_buffer_source` wraps each stage using pointer construction, allowing uniform composition while preserving the lifetime of the underlying objects.
@@ -392,6 +433,10 @@ Output (52 bytes):
 2. Implement a ROT13 transform
 3. Create a filtering stage that drops lines matching a pattern
 
+== Next Steps
+
+* xref:8.examples/8k.strand-serialization.adoc[Strand Serialization] -- Lock-free shared state with strands
+
 == Summary
 
 This example catalog demonstrated:
@@ -406,5 +451,8 @@ This example catalog demonstrated:
 * Custom buffer implementations
 * Real network I/O with Corosio
 * Data transformation pipelines
+* Strand-based serialization and async mutexes
+* Parallel task distribution across thread pools
+* Custom executor implementations
 
 These patterns form the foundation for building robust, efficient I/O applications with Capy.
diff --git a/doc/modules/ROOT/pages/8.examples/8k.strand-serialization.adoc b/doc/modules/ROOT/pages/8.examples/8k.strand-serialization.adoc
new file mode 100644
index 000000000..d7c66fc85
--- /dev/null
+++ b/doc/modules/ROOT/pages/8.examples/8k.strand-serialization.adoc
@@ -0,0 +1,146 @@
+= Strand Serialization
+
+Protecting shared state with a strand instead of a mutex.
+
+== What You Will Learn
+
+* Using a `strand` to serialize coroutine access to shared state
+* Lock-free shared state management
+* Combining `when_all` with strand-based serialization
+
+== Prerequisites
+
+* Completed xref:8.examples/8b.producer-consumer.adoc[Producer-Consumer] (introduces `strand`)
+* Understanding of `when_all` from xref:../4.coroutines/4f.composition.adoc[Composition]
+
+== Source Code
+
+[source,cpp]
+----
+#include <boost/capy.hpp>
+#include <iostream>
+#include <latch>
+#include <vector>
+
+namespace capy = boost::capy;
+
+int main()
+{
+    constexpr int num_coroutines = 10;
+    constexpr int increments_per_coro = 1000;
+
+    capy::thread_pool pool(4);
+    capy::strand s{pool.get_executor()};
+    std::latch done(1);
+
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr ep) {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "Error: " << e.what() << "\n";
+        }
+        catch (...) {
+            std::cerr << "Error: unknown exception\n";
+        }
+        done.count_down();
+    };
+
+    int counter = 0;
+
+    // Each coroutine increments the shared counter without locks.
+    // The strand ensures only one coroutine runs at a time.
+    auto increment = [&](int id) -> capy::io_task<> {
+        for (int i = 0; i < increments_per_coro; ++i)
+            ++counter;
+        std::cout << "Coroutine " << id
+                  << " finished, counter = " << counter << "\n";
+        co_return capy::io_result<>{};
+    };
+
+    auto run_all = [&]() -> capy::task<> {
+        std::vector<capy::io_task<>> tasks;
+        for (int i = 0; i < num_coroutines; ++i)
+            tasks.push_back(increment(i));
+        (void) co_await capy::when_all(std::move(tasks));
+    };
+
+    capy::run_async(s, on_complete, on_error)(run_all());
+    done.wait();
+
+    int expected = num_coroutines * increments_per_coro;
+    std::cout << "\nFinal counter: " << counter
+              << " (expected " << expected << ")\n";
+
+    return 0;
+}
+----
+
+== Build
+
+[source,cmake]
+----
+add_executable(strand_serialization strand_serialization.cpp)
+target_link_libraries(strand_serialization PRIVATE Boost::capy)
+----
+
+== Walkthrough
+
+=== Strand as Serializer
+
+[source,cpp]
+----
+capy::strand s{pool.get_executor()};
+----
+
+A `strand` wraps an executor and guarantees that handlers dispatched through it never run concurrently. This replaces the need for a mutex when protecting shared state accessed by coroutines.
+
+=== Lock-Free Shared Access
+
+[source,cpp]
+----
+int counter = 0;
+
+auto increment = [&](int id) -> capy::io_task<> {
+    for (int i = 0; i < increments_per_coro; ++i)
+        ++counter;
+    // ...
+};
+----
+
+Multiple coroutines increment the same `counter` without any locks. The strand serializes execution so only one coroutine runs at a time, preventing data races.
+
+=== Running on the Strand
+
+[source,cpp]
+----
+capy::run_async(s, on_complete, on_error)(run_all());
+----
+
+Passing the strand `s` to `run_async` ensures the entire coroutine tree executes through the strand. Even though the underlying `thread_pool` has 4 threads, the strand constrains execution to one coroutine at a time.
+
+== Output
+
+----
+Coroutine 0 finished, counter = 1000
+Coroutine 1 finished, counter = 2000
+Coroutine 2 finished, counter = 3000
+Coroutine 3 finished, counter = 4000
+Coroutine 4 finished, counter = 5000
+Coroutine 5 finished, counter = 6000
+Coroutine 6 finished, counter = 7000
+Coroutine 7 finished, counter = 8000
+Coroutine 8 finished, counter = 9000
+Coroutine 9 finished, counter = 10000
+
+Final counter: 10000 (expected 10000)
+----
+
+== Exercises
+
+1. Remove the strand and run directly on the pool executor -- observe the data race
+2. Replace the plain `int` counter with `std::atomic<int>` and compare the two approaches
+3. Add a second shared variable and verify both are protected by the strand
+
+== Next Steps
+
+* xref:8.examples/8l.async-mutex.adoc[Async Mutex] -- FIFO coroutine locking
diff --git a/doc/modules/ROOT/pages/8.examples/8l.async-mutex.adoc b/doc/modules/ROOT/pages/8.examples/8l.async-mutex.adoc
new file mode 100644
index 000000000..dfcac7d90
--- /dev/null
+++ b/doc/modules/ROOT/pages/8.examples/8l.async-mutex.adoc
@@ -0,0 +1,168 @@
+= Async Mutex
+
+Fair FIFO coroutine locking with `async_mutex`.
+
+== What You Will Learn
+
+* Using `async_mutex` for mutual exclusion between coroutines
+* RAII lock guards with `scoped_lock`
+* FIFO fairness guarantees
+* Comparing `async_mutex` to strand-based serialization
+
+== Prerequisites
+
+* Completed xref:8.examples/8k.strand-serialization.adoc[Strand Serialization]
+
+== Source Code
+
+[source,cpp]
+----
+#include <boost/capy.hpp>
+#include <iostream>
+#include <latch>
+#include <vector>
+
+namespace capy = boost::capy;
+
+int main()
+{
+    capy::thread_pool pool;
+    capy::strand s{pool.get_executor()};
+    std::latch done(1);
+
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr ep) {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "Error: " << e.what() << "\n";
+        }
+        catch (...) {
+            std::cerr << "Error: unknown exception\n";
+        }
+        done.count_down();
+    };
+
+    capy::async_mutex mtx;
+    int acquisition_order = 0;
+    std::vector<int> order_log;
+
+    auto worker = [&](int id) -> capy::io_task<> {
+        std::cout << "Worker " << id << " waiting for lock\n";
+        auto [ec, guard] = co_await mtx.scoped_lock();
+        if (ec)
+        {
+            std::cout << "Worker " << id
+                      << " canceled: " << ec.message() << "\n";
+            co_return capy::io_result<>{ec};
+        }
+
+        int seq = acquisition_order++;
+        order_log.push_back(id);
+        std::cout << "Worker " << id
+                  << " acquired lock (sequence " << seq << ")\n";
+
+        std::cout << "Worker " << id << " releasing lock\n";
+        co_return capy::io_result<>{};
+    };
+
+    auto run_all = [&]() -> capy::task<> {
+        auto r = co_await capy::when_all(
+            worker(0), worker(1), worker(2),
+            worker(3), worker(4), worker(5));
+        if(r.ec)
+            std::cerr << "when_all error: "
+                      << r.ec.message() << "\n";
+    };
+
+    // Run on a strand so async_mutex operations are single-threaded
+    capy::run_async(s, on_complete, on_error)(run_all());
+    done.wait();
+
+    std::cout << "\nAcquisition order: ";
+    for (std::size_t i = 0; i < order_log.size(); ++i)
+    {
+        if (i > 0)
+            std::cout << " -> ";
+        std::cout << "W" << order_log[i];
+    }
+    std::cout << "\n";
+
+    return 0;
+}
+----
+
+== Build
+
+[source,cmake]
+----
+add_executable(async_mutex async_mutex.cpp)
+target_link_libraries(async_mutex PRIVATE Boost::capy)
+----
+
+== Walkthrough
+
+=== Creating the Mutex
+
+[source,cpp]
+----
+capy::async_mutex mtx;
+----
+
+`async_mutex` is a coroutine-aware mutex. Unlike `std::mutex`, it suspends the calling coroutine instead of blocking the thread, allowing other coroutines to run while waiting for the lock.
+
+=== Scoped Lock
+
+[source,cpp]
+----
+auto [ec, guard] = co_await mtx.scoped_lock();
+if (ec)
+{
+    // Lock was canceled
+    co_return capy::io_result<>{ec};
+}
+----
+
+`scoped_lock()` returns an `io_result` with an error code and an RAII guard. The guard automatically releases the lock when it goes out of scope. If the operation is canceled (e.g., via a stop token), `ec` will be set.
+
+=== FIFO Fairness
+
+Workers acquire the lock in the order they request it. Unlike `std::mutex`, which has no fairness guarantees, `async_mutex` ensures FIFO ordering -- the first coroutine to call `scoped_lock()` is the first to acquire it.
+
+=== Strand vs Async Mutex
+
+The strand serialization example showed how a strand can protect shared state by running all coroutines sequentially. `async_mutex` provides finer-grained control: coroutines run concurrently and only serialize when entering the critical section.
+
+== Output
+
+----
+Worker 0 waiting for lock
+Worker 0 acquired lock (sequence 0)
+Worker 0 releasing lock
+Worker 1 waiting for lock
+Worker 1 acquired lock (sequence 1)
+Worker 1 releasing lock
+Worker 2 waiting for lock
+Worker 2 acquired lock (sequence 2)
+Worker 2 releasing lock
+Worker 3 waiting for lock
+Worker 3 acquired lock (sequence 3)
+Worker 3 releasing lock
+Worker 4 waiting for lock
+Worker 4 acquired lock (sequence 4)
+Worker 4 releasing lock
+Worker 5 waiting for lock
+Worker 5 acquired lock (sequence 5)
+Worker 5 releasing lock
+
+Acquisition order: W0 -> W1 -> W2 -> W3 -> W4 -> W5
+----
+
+== Exercises
+
+1. Add work outside the critical section (before and after `scoped_lock`) to observe concurrent execution
+2. Use a stop token to cancel waiting workers after a timeout
+3. Replace `async_mutex` with a strand and compare the two approaches
+
+== Next Steps
+
+* xref:8.examples/8m.parallel-tasks.adoc[Parallel Tasks] -- Distributing work across a thread pool
diff --git a/doc/modules/ROOT/pages/8.examples/8m.parallel-tasks.adoc b/doc/modules/ROOT/pages/8.examples/8m.parallel-tasks.adoc
new file mode 100644
index 000000000..facdd194e
--- /dev/null
+++ b/doc/modules/ROOT/pages/8.examples/8m.parallel-tasks.adoc
@@ -0,0 +1,162 @@
+= Parallel Tasks
+
+Distributing CPU-bound work across a thread pool and collecting results.
+
+== What You Will Learn
+
+* Running tasks in parallel on a `thread_pool`
+* Collecting results with `when_all` structured bindings
+* Observing thread IDs to verify parallel execution
+
+== Prerequisites
+
+* Completed xref:8.examples/8g.parallel-fetch.adoc[Parallel Fetch] (introduces `when_all`)
+
+== Source Code
+
+[source,cpp]
+----
+#include <boost/capy.hpp>
+#include <iostream>
+#include <latch>
+#include <sstream>
+#include <thread>
+
+namespace capy = boost::capy;
+
+// Sum integers in [lo, hi)
+capy::io_task<long long> partial_sum(int lo, int hi)
+{
+    std::ostringstream oss;
+    oss << "  range [" << lo << ", " << hi
+        << ") on thread " << std::this_thread::get_id() << "\n";
+    std::cout << oss.str();
+
+    long long sum = 0;
+    for (int i = lo; i < hi; ++i)
+        sum += i;
+    co_return capy::io_result<long long>{{}, sum};
+}
+
+int main()
+{
+    constexpr int total = 10000;
+    constexpr int num_tasks = 4;
+    constexpr int chunk = total / num_tasks;
+
+    capy::thread_pool pool(num_tasks);
+    std::latch done(1);
+
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr ep) {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "Error: " << e.what() << "\n";
+        }
+        catch (...) {
+            std::cerr << "Error: unknown exception\n";
+        }
+        done.count_down();
+    };
+
+    auto compute = [&]() -> capy::task<> {
+        std::cout << "Dispatching " << num_tasks
+                  << " parallel tasks...\n";
+
+        auto [ec, s0, s1, s2, s3] = co_await capy::when_all(
+            partial_sum(0 * chunk, 1 * chunk),
+            partial_sum(1 * chunk, 2 * chunk),
+            partial_sum(2 * chunk, 3 * chunk),
+            partial_sum(3 * chunk, 4 * chunk));
+
+        long long total_sum = s0 + s1 + s2 + s3;
+
+        // Arithmetic series: sum [0, N) = N*(N-1)/2
+        long long expected =
+            static_cast<long long>(total) * (total - 1) / 2;
+
+        std::cout << "\nPartial sums: " << s0 << " + " << s1
+                  << " + " << s2 << " + " << s3 << "\n";
+        std::cout << "Total: " << total_sum
+                  << " (expected " << expected << ")\n";
+    };
+
+    capy::run_async(pool.get_executor(), on_complete, on_error)(compute());
+    done.wait();
+
+    return 0;
+}
+----
+
+== Build
+
+[source,cmake]
+----
+add_executable(parallel_tasks parallel_tasks.cpp)
+target_link_libraries(parallel_tasks PRIVATE Boost::capy)
+----
+
+== Walkthrough
+
+=== Partitioning Work
+
+[source,cpp]
+----
+constexpr int total = 10000;
+constexpr int num_tasks = 4;
+constexpr int chunk = total / num_tasks;
+----
+
+The range `[0, 10000)` is divided into 4 equal chunks, one per task. Each task computes a partial sum independently.
+
+=== Parallel Execution with when_all
+
+[source,cpp]
+----
+auto [ec, s0, s1, s2, s3] = co_await capy::when_all(
+    partial_sum(0 * chunk, 1 * chunk),
+    partial_sum(1 * chunk, 2 * chunk),
+    partial_sum(2 * chunk, 3 * chunk),
+    partial_sum(3 * chunk, 4 * chunk));
+----
+
+`when_all` launches all four tasks concurrently on the thread pool. Each task may run on a different thread. The result is `io_result<long long, long long, long long, long long>`, a single `ec` plus the four partial sums in input order.
+
+=== Observing Thread IDs
+
+[source,cpp]
+----
+std::ostringstream oss;
+oss << "  range [" << lo << ", " << hi
+    << ") on thread " << std::this_thread::get_id() << "\n";
+std::cout << oss.str();
+----
+
+Each task prints its thread ID. On a multi-core system you will see different thread IDs, confirming true parallel execution. The `ostringstream` ensures each line is printed atomically.
+
+=== Verifying Results
+
+The sum of `[0, N)` is `N*(N-1)/2`. The example verifies that the sum of partial results matches this formula.
+
+== Output
+
+----
+Dispatching 4 parallel tasks...
+  range [0, 2500) on thread 140234567890432
+  range [2500, 5000) on thread 140234567886336
+  range [5000, 7500) on thread 140234567882240
+  range [7500, 10000) on thread 140234567878144
+
+Partial sums: 3123750 + 9373750 + 15623750 + 21873750
+Total: 49995000 (expected 49995000)
+----
+
+== Exercises
+
+1. Increase `num_tasks` beyond the pool thread count and observe how tasks are scheduled
+2. Add a timing comparison between parallel execution and a single-threaded loop
+3. Generalize the partitioning to handle ranges that don't divide evenly
+
+== Next Steps
+
+* xref:8.examples/8n.custom-executor.adoc[Custom Executor] -- Building your own execution context
diff --git a/doc/modules/ROOT/pages/8.examples/8n.custom-executor.adoc b/doc/modules/ROOT/pages/8.examples/8n.custom-executor.adoc
new file mode 100644
index 000000000..2c9183118
--- /dev/null
+++ b/doc/modules/ROOT/pages/8.examples/8n.custom-executor.adoc
@@ -0,0 +1,254 @@
+= Custom Executor
+
+Implementing the Executor concept with a single-threaded run loop.
+
+== What You Will Learn
+
+* Satisfying the `Executor` concept
+* Implementing `execution_context`, `dispatch`, and `post`
+* Running Capy coroutines on a custom scheduling system
+
+== Prerequisites
+
+* Understanding of executors from xref:../4.coroutines/4c.executors.adoc[Executors and Execution Contexts]
+
+== Source Code
+
+[source,cpp]
+----
+#include <boost/capy.hpp>
+#include <boost/capy/ex/frame_allocator.hpp>
+#include <iostream>
+#include <queue>
+#include <thread>
+
+namespace capy = boost::capy;
+
+// A minimal single-threaded execution context.
+// Demonstrates how to satisfy the Executor concept
+// for any custom scheduling system.
+class run_loop : public capy::execution_context
+{
+    std::queue<std::coroutine_handle<>> queue_;
+    std::thread::id owner_;
+
+public:
+    class executor_type;
+
+    run_loop()
+        : execution_context(this)
+    {
+    }
+
+    ~run_loop()
+    {
+        shutdown();
+        destroy();
+    }
+
+    run_loop(run_loop const&) = delete;
+    run_loop& operator=(run_loop const&) = delete;
+
+    // Drain the queue until empty
+    void run()
+    {
+        owner_ = std::this_thread::get_id();
+        while (!queue_.empty())
+        {
+            auto h = queue_.front();
+            queue_.pop();
+            capy::safe_resume(h);
+        }
+    }
+
+    void enqueue(std::coroutine_handle<> h)
+    {
+        queue_.push(h);
+    }
+
+    bool is_running_on_this_thread() const noexcept
+    {
+        return std::this_thread::get_id() == owner_;
+    }
+
+    executor_type get_executor() noexcept;
+};
+
+class run_loop::executor_type
+{
+    friend class run_loop;
+    run_loop* loop_ = nullptr;
+
+    explicit executor_type(run_loop& loop) noexcept
+        : loop_(&loop)
+    {
+    }
+
+public:
+    executor_type() = default;
+
+    capy::execution_context& context() const noexcept
+    {
+        return *loop_;
+    }
+
+    void on_work_started() const noexcept {}
+    void on_work_finished() const noexcept {}
+
+    std::coroutine_handle<> dispatch(
+        capy::continuation& c) const
+    {
+        if (loop_->is_running_on_this_thread())
+            return c.h;
+        loop_->enqueue(c.h);
+        return std::noop_coroutine();
+    }
+
+    void post(capy::continuation& c) const
+    {
+        loop_->enqueue(c.h);
+    }
+
+    bool operator==(executor_type const& other) const noexcept
+    {
+        return loop_ == other.loop_;
+    }
+};
+
+inline
+run_loop::executor_type
+run_loop::get_executor() noexcept
+{
+    return executor_type{*this};
+}
+
+// Verify the concept is satisfied
+static_assert(capy::Executor<run_loop::executor_type>);
+
+capy::io_task<int> compute(int x)
+{
+    std::cout << "  computing " << x << " * " << x << "\n";
+    co_return capy::io_result<int>{{}, x * x};
+}
+
+capy::task<> run_tasks()
+{
+    std::cout << "Launching 3 tasks with when_all...\n";
+
+    auto [ec, r1, r2, r3] = co_await capy::when_all(
+        compute(3), compute(7), compute(11));
+
+    std::cout << "\nResults: " << r1 << ", " << r2
+              << ", " << r3 << "\n";
+    std::cout << "Sum of squares: "
+              << r1 + r2 + r3 << "\n";
+}
+
+int main()
+{
+    run_loop loop;
+
+    // Launch using run_async, just like with thread_pool
+    capy::run_async(loop.get_executor())(run_tasks());
+
+    // Drive the loop — all coroutines execute here
+    std::cout << "Running event loop on main thread...\n";
+    loop.run();
+
+    std::cout << "Event loop finished.\n";
+    return 0;
+}
+----
+
+== Build
+
+[source,cmake]
+----
+add_executable(custom_executor custom_executor.cpp)
+target_link_libraries(custom_executor PRIVATE Boost::capy)
+----
+
+== Walkthrough
+
+=== Inheriting execution_context
+
+[source,cpp]
+----
+class run_loop : public capy::execution_context
+{
+    // ...
+    run_loop()
+        : execution_context(this)
+    {
+    }
+};
+----
+
+Custom execution contexts inherit from `execution_context` and pass `this` to the base constructor. The destructor must call `shutdown()` then `destroy()` to clean up coroutine state.
+
+=== The Executor Concept
+
+The nested `executor_type` must provide:
+
+* `context()` -- returns a reference to the owning `execution_context`
+* `on_work_started()` / `on_work_finished()` -- work-tracking hooks
+* `dispatch(c)` -- resume immediately if already on this context, otherwise enqueue. Takes a `continuation&` and returns `std::coroutine_handle<>`.
+* `post(c)` -- always enqueue for later execution. Takes a `continuation&`.
+* `operator==` -- compare two executors for identity
+
+[source,cpp]
+----
+static_assert(capy::Executor<run_loop::executor_type>);
+----
+
+The `static_assert` verifies at compile time that all concept requirements are met.
+
+=== Dispatch vs Post
+
+[source,cpp]
+----
+std::coroutine_handle<> dispatch(
+    capy::continuation& c) const
+{
+    if (loop_->is_running_on_this_thread())
+        return c.h;        // resume inline
+    loop_->enqueue(c.h);
+    return std::noop_coroutine();  // defer
+}
+----
+
+`dispatch` takes a `continuation&` and checks whether the caller is already running on the loop's thread. If so, it returns `c.h` directly for inline resumption via symmetric transfer. Otherwise it enqueues `c.h` and returns `noop_coroutine` so the caller continues without blocking.
+
+`post` always enqueues, even if already on the right thread.
+
+=== Driving the Loop
+
+[source,cpp]
+----
+capy::run_async(loop.get_executor())(run_tasks());
+loop.run();
+----
+
+`run_async` enqueues the initial coroutine. `loop.run()` drains the queue, resuming coroutines one by one until all work completes. This is analogous to a GUI event loop or game tick loop.
+
+Note that `run()` uses `capy::safe_resume(h)` instead of `h.resume()`. This saves and restores the thread-local frame allocator around each resumption, preventing coroutines from spoiling each other's allocator. All custom executor event loops must use `safe_resume` -- see xref:../4.coroutines/4g.allocators.adoc#_tls_preservation[TLS Preservation] for details.
+
+== Output
+
+----
+Running event loop on main thread...
+Launching 3 tasks with when_all...
+  computing 3 * 3
+  computing 7 * 7
+  computing 11 * 11
+
+Results: 9, 49, 121
+Sum of squares: 179
+Event loop finished.
+----
+
+== Exercises
+
+1. Add a `stop()` method that causes `run()` to exit early, even with work remaining
+2. Make the run loop thread-safe so work can be posted from other threads
+3. Integrate the run loop with a platform event system (e.g., `epoll`, `kqueue`, or a GUI framework)
diff --git a/doc/modules/ROOT/pages/8.design/8.intro.adoc b/doc/modules/ROOT/pages/9.design/9.intro.adoc
similarity index 100%
rename from doc/modules/ROOT/pages/8.design/8.intro.adoc
rename to doc/modules/ROOT/pages/9.design/9.intro.adoc
diff --git a/doc/modules/ROOT/pages/8.design/8a.CapyLayering.adoc b/doc/modules/ROOT/pages/9.design/9a.CapyLayering.adoc
similarity index 99%
rename from doc/modules/ROOT/pages/8.design/8a.CapyLayering.adoc
rename to doc/modules/ROOT/pages/9.design/9a.CapyLayering.adoc
index c29fda746..4b3b2abe7 100644
--- a/doc/modules/ROOT/pages/8.design/8a.CapyLayering.adoc
+++ b/doc/modules/ROOT/pages/9.design/9a.CapyLayering.adoc
@@ -42,7 +42,7 @@ task<> echo(any_stream& stream)
     {
         auto [ec, n] = co_await stream.read_some(
             mutable_buffer(buf));
-        if(ec.failed())
+        if(ec)
             co_return;
         co_await write(stream, const_buffer(buf, n));
     }
diff --git a/doc/modules/ROOT/pages/8.design/8b.Separation.adoc b/doc/modules/ROOT/pages/9.design/9b.Separation.adoc
similarity index 100%
rename from doc/modules/ROOT/pages/8.design/8b.Separation.adoc
rename to doc/modules/ROOT/pages/9.design/9b.Separation.adoc
diff --git a/doc/modules/ROOT/pages/9.design/9c.ReadStream.adoc b/doc/modules/ROOT/pages/9.design/9c.ReadStream.adoc
new file mode 100644
index 000000000..4edfce423
--- /dev/null
+++ b/doc/modules/ROOT/pages/9.design/9c.ReadStream.adoc
@@ -0,0 +1,338 @@
+= ReadStream Concept Design
+
+== Overview
+
+This document describes the design of the `ReadStream` concept: the fundamental partial-read primitive in the concept hierarchy. It explains why `read_some` is the correct building block, how composed algorithms build on top of it, and the relationship to `ReadSource`.
+
+== Definition
+
+[source,cpp]
+----
+template<typename T>
+concept ReadStream =
+    requires(T& stream, mutable_buffer_archetype buffers)
+    {
+        { stream.read_some(buffers) } -> IoAwaitable;
+        requires awaitable_decomposes_to<
+            decltype(stream.read_some(buffers)),
+            std::error_code, std::size_t>;
+    };
+----
+
+A `ReadStream` provides a single operation:
+
+=== `read_some(buffers)` -- Partial Read
+
+Attempts to read up to `buffer_size(buffers)` bytes from the stream into the buffer sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes read.
+
+==== Semantics
+
+If `buffer_size(buffers) > 0`:
+
+- If `!ec`, then `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were read into the buffer sequence.
+- If `ec`, then `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes read before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
+
+The caller must not assume the buffer is filled. `read_some` may return fewer bytes than the buffer can hold. This is the defining property of a partial-read primitive.
+
+Once `read_some` returns an error (including EOF), the caller must not call `read_some` again. The stream is done. Not all implementations can reproduce a prior error on subsequent calls, so the behavior after an error is undefined.
+
+Buffers in the sequence are filled in order.
+
+==== Error Reporting
+
+I/O conditions arising from the underlying I/O system (EOF, connection reset, broken pipe, etc.) are reported via the `error_code` component of the return value. Failures in the library wrapper itself (such as memory allocation failure) are reported via exceptions.
+
+*Throws:* `std::bad_alloc` if coroutine frame allocation fails.
+
+==== Buffer Lifetime
+
+The caller must ensure that the memory referenced by `buffers` remains valid until the `co_await` expression returns.
+
+==== Conforming Signatures
+
+[source,cpp]
+----
+template<MutableBufferSequence Buffers>
+IoAwaitable auto read_some(Buffers buffers);
+----
+
+Buffer sequences should be accepted by value when the member function is a coroutine, to ensure the sequence lives in the coroutine frame across suspension points.
+
+== Concept Hierarchy
+
+`ReadStream` is the base of the read-side hierarchy:
+
+----
+ReadStream    { read_some }
+    |
+    v
+ReadSource    { read_some, read }
+----
+
+`ReadSource` refines `ReadStream`. Every `ReadSource` is a `ReadStream`. Algorithms constrained on `ReadStream` accept both raw streams and sources. The `ReadSource` concept adds a complete-read primitive on top of the partial-read primitive.
+
+This mirrors the write side:
+
+----
+WriteStream   { write_some }
+    |
+    v
+WriteSink     { write_some, write, write_eof(buffers), write_eof() }
+----
+
+== Composed Algorithms
+
+Three composed algorithms build on `read_some`:
+
+=== `read(stream, buffers)` -- Fill a Buffer Sequence
+
+[source,cpp]
+----
+auto read(ReadStream auto& stream,
+          MutableBufferSequence auto const& buffers)
+    -> io_task<std::size_t>;
+----
+
+Loops `read_some` until the entire buffer sequence is filled or an error (including EOF) occurs. On success, `n == buffer_size(buffers)`.
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<> read_header(Stream& stream)
+{
+    char header[16];
+    auto [ec, n] = co_await read(
+        stream, mutable_buffer(header));
+    if(ec == cond::eof)
+        co_return;  // clean shutdown
+    if(ec)
+        co_return;
+    // header contains exactly 16 bytes
+}
+----
+
+=== `read(stream, dynamic_buffer)` -- Read Until EOF
+
+[source,cpp]
+----
+auto read(ReadStream auto& stream,
+          DynamicBufferParam auto&& buffers,
+          std::size_t initial_amount = 2048)
+    -> io_task<std::size_t>;
+----
+
+Reads from the stream into a dynamic buffer until EOF is reached. The buffer grows with a 1.5x factor when filled. On success (EOF), `ec` is clear and `n` is the total bytes read.
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<std::string> slurp(Stream& stream)
+{
+    std::string body;
+    auto [ec, n] = co_await read(
+        stream, string_dynamic_buffer(&body));
+    if(ec)
+        co_return {};
+    co_return body;
+}
+----
+
+=== `read_until(stream, dynamic_buffer, match)` -- Delimited Read
+
+Reads from the stream into a dynamic buffer until a delimiter or match condition is found. Used for line-oriented protocols and message framing.
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<> read_line(Stream& stream)
+{
+    std::string line;
+    auto [ec, n] = co_await read_until(
+        stream, string_dynamic_buffer(&line), "\r\n");
+    if(ec)
+        co_return;
+    // line contains data up to and including "\r\n"
+}
+----
+
+== Use Cases
+
+=== Incremental Processing with `read_some`
+
+When processing data as it arrives without waiting for a full buffer, `read_some` is the right choice. This is common for real-time data or when the processing can handle partial input.
+
+[source,cpp]
+----
+template<ReadStream Stream>
+task<> echo(Stream& stream, WriteStream auto& dest)
+{
+    char buf[4096];
+    for(;;)
+    {
+        auto [ec, n] = co_await stream.read_some(
+            mutable_buffer(buf));
+
+        auto [wec, nw] = co_await dest.write_some(
+            const_buffer(buf, n));
+
+        if(ec)
+            co_return;
+
+        if(wec)
+            co_return;
+    }
+}
+----
+
+=== Relaying from ReadStream to WriteStream
+
+When relaying data from a reader to a writer, `read_some` feeds `write_some` directly. This is the fundamental streaming pattern.
+
+[source,cpp]
+----
+template<ReadStream Src, WriteStream Dest>
+task<> relay(Src& src, Dest& dest)
+{
+    char storage[65536];
+    circular_dynamic_buffer cb(storage, sizeof(storage));
+
+    for(;;)
+    {
+        // Read into free space
+        auto mb = cb.prepare(cb.capacity());
+        auto [rec, nr] = co_await src.read_some(mb);
+        cb.commit(nr);
+
+        if(rec && rec != cond::eof)
+            co_return;
+
+        // Drain to destination
+        while(cb.size() > 0)
+        {
+            auto [wec, nw] = co_await dest.write_some(
+                cb.data());
+            if(wec)
+                co_return;
+            cb.consume(nw);
+        }
+
+        if(rec == cond::eof)
+            co_return;
+    }
+}
+----
+
+Because `ReadSource` refines `ReadStream`, this relay function also accepts `ReadSource` types. An HTTP body source or a decompressor can be relayed to a `WriteStream` using the same function.
+
+== Relationship to the Write Side
+
+[cols="1,1"]
+|===
+| Read Side | Write Side
+
+| `ReadStream::read_some`
+| `WriteStream::write_some`
+
+| `read` free function (composed)
+| `write_now` (composed, eager)
+
+| `read_until` (composed, delimited)
+| No write-side equivalent
+
+| `ReadSource::read`
+| `WriteSink::write`
+|===
+
+== Design Foundations: Why Errors May Accompany Data
+
+The `read_some` contract permits `n > 0` when `ec` is set. Data and errors are not mutually exclusive: the implementation reports exactly what happened. This is the most consequential design decision in the `ReadStream` concept, with implications for every consumer of `read_some` in the library. This section explains the design and its consequences.
+
+=== The Return Type's Purpose
+
+POSIX `read(2)` returns a single `ssize_t` -- either a byte count or -1 with `errno`. It cannot report both a byte count and an error simultaneously. When a partial transfer occurs before an error, POSIX returns the byte count on the current call and defers the error to the next. The `(error_code, size_t)` return type was designed to transcend this limitation. It can carry both values at once, allowing implementations to report partial transfers alongside the condition that stopped the transfer, as a single result.
+
+=== Departing from Asio
+
+Asio's `AsyncReadStream` concept requires `bytes_transferred == 0` on error. This was a reasonable design for an API built around POSIX-like streams, where the underlying system calls enforce binary outcomes per call. However, it imposes a burden on layered streams that do not share this limitation.
+
+A TLS stream might decrypt 100 bytes into user space, then receive a fatal alert on the next record. Under the strict rule it must either report `(!ec, 100)` now and `(ec, 0)` on the next call (requiring deferred-error bookkeeping), or report `(ec, 0)` and discard 100 valid bytes. Neither is clean. Under the relaxed rule, the TLS stream reports `(ec, 100)` honestly: here are the bytes that arrived, and here is the condition that stopped the transfer.
+
+The `ReadStream` concept permits both behaviors. Streams that naturally produce `(ec, 0)` on error (such as POSIX socket wrappers) conform. Streams that report `(ec, n)` with `n > 0` (such as TLS or compression layers) also conform. The concept imposes the weakest postcondition that all conforming types can satisfy.
+
+=== The Empty-Buffer Rule
+
+When `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
+
+Whether the implementation performs a system call for a zero-length buffer is unspecified. A concrete type that short-circuits with `(!ec, 0)` conforms. A concrete type that forwards the zero-length call to the OS and reports whatever condition arises also conforms. The concept leaves this to the implementation.
+
+This flexibility permits zero-length operations to serve as probes (fd validation, broken pipe detection) on implementations that support it, without the concept forbidding the resulting error.
+
+=== Why EOF Is an Error
+
+EOF is reported as an error code (`cond::eof`) rather than as a success with `n == 0`, for two reasons:
+
+*Composed operations need EOF-as-error to report early termination.* The composed `read(stream, buffer(buf, 100))` promises to fill exactly 100 bytes. If the stream ends after 50, the operation did not fulfill its contract. Reporting `{success, 50}` would be misleading. Reporting `{eof, 50}` tells the caller both what happened (50 bytes landed in the buffer) and why the operation stopped (the stream ended).
+
+*EOF-as-error disambiguates the empty-buffer case from the end of a stream.* Without EOF-as-error, both `read_some(empty_buffer)` on a live stream and `read_some(non_empty_buffer)` on an exhausted stream could produce `{success, 0}`. The caller could not distinguish "I passed no buffer" from "the stream is done."
+
+=== The Canonical I/O Loop
+
+Every composed read algorithm that accumulates progress follows the same pattern:
+
+[source,cpp]
+----
+auto [ec, n] = co_await s.read_some(
+    mutable_buffer(buf + total, size - total));
+total += n;
+if(ec)
+    co_return;
+----
+
+The advance-then-check ordering is the only correct pattern. It is required for any operation that can report partial progress alongside an error -- `read` returning `(eof, 47)` being the canonical example. If the check precedes the advance, the 47 bytes are silently dropped.
+
+Under the strict rule (`n == 0` on error), the advance is a harmless no-op. Under the relaxed rule (`n >= 0` on error), the advance captures partial progress. The caller writes identical code either way. The perceived simplification of the strict rule exists only if the caller writes the check-then-advance anti-pattern, which is already incorrect for other reasons.
+
+=== Implementer Freedom
+
+Under the strict rule, every stream that might encounter an error after a partial transfer must choose between:
+
+- **Deferred errors.** Report `(!ec, k)` now, remember the error, and report `(ec, 0)` on the next call. This requires per-stream state and makes the stream's behavior depend on call history.
+- **Data loss.** Report `(ec, 0)` and discard the `k` bytes that were transferred.
+- **Internal buffering.** Copy the `k` bytes into an internal buffer and replay them on the next call. This adds allocation and copying overhead.
+
+Under the relaxed rule, the implementation reports what happened: `(ec, k)`. No deferred state, no data loss, no internal buffering.
+
+=== Consistency from Primitives Through Composed Operations
+
+The strict postcondition on `read_some` does not propagate to composed operations. The composed `read` returns `(ec, m)` where `m > 0` on failure, because it accumulates data across multiple internal `read_some` calls. The `(ec, n > 0)` case that the strict rule eliminates from `read_some` is immediately reintroduced one layer up.
+
+The relaxed postcondition avoids this inconsistency. Partial progress alongside an error code is the same pattern at every level -- from `read_some` through composed `read` -- rather than being forbidden at the primitive level and required at the composed level.
+
+=== Conforming Sources
+
+Concrete `ReadStream` implementations are free to report `n == 0` or `n > 0` on error, whichever is natural:
+
+- **TCP sockets**: `read_some` maps to a single `recv()` or `WSARecv()` call. POSIX and Windows enforce binary outcomes, so these naturally produce `(ec, 0)` on error.
+- **TLS streams**: `read_some` decrypts application data. If a fatal alert arrives after decrypting a partial record, the implementation may report `(ec, n)` with the bytes that were decrypted.
+- **HTTP content-length body**: delivers bytes up to the content-length limit. Once the limit is reached, the next `read_some` returns EOF.
+- **HTTP chunked body**: the unchunker delivers decoded data from chunks. The terminal `0\r\n\r\n` is parsed on a separate pass that returns EOF.
+- **Compression (inflate)**: the decompressor delivers output bytes. `Z_STREAM_END` may arrive alongside the final output, allowing `(eof, n)` with the last bytes.
+- **Memory source**: returns `min(requested, remaining)` bytes. May report `(eof, n)` on the final call when remaining is known, or `(eof, 0)` on a subsequent call.
+- **QUIC streams**: `read_some` returns data from received QUIC frames. Stream FIN may arrive with the last data, allowing `(eof, n)`.
+- **Buffered read streams**: `read_some` returns data from an internal buffer. EOF propagates from the underlying stream.
+- **Test mock streams**: `read_some` returns configurable data and error sequences for testing.
+
+No source is forced into an unnatural pattern. Sources that naturally separate data from errors continue to do so. Sources that naturally discover errors alongside data are free to report both.
+
+== Summary
+
+`ReadStream` provides `read_some` as the single partial-read primitive. This is deliberately minimal:
+
+- Algorithms that need to fill a buffer completely use the `read` composed algorithm.
+- Algorithms that need delimited reads use `read_until`.
+- Algorithms that need to process data as it arrives use `read_some` directly.
+- `ReadSource` refines `ReadStream` by adding `read` for complete-read semantics.
+
+The contract permits errors to accompany partial data. This uses the richer `(error_code, size_t)` return type to its full potential, avoids forcing non-POSIX streams into a deferred-error model, and produces a postcondition that is consistent from `read_some` through composed operations. The canonical advance-then-check loop handles both cases correctly with no additional call-site cost.
diff --git a/doc/modules/ROOT/pages/8.design/8d.ReadSource.adoc b/doc/modules/ROOT/pages/9.design/9d.ReadSource.adoc
similarity index 86%
rename from doc/modules/ROOT/pages/8.design/8d.ReadSource.adoc
rename to doc/modules/ROOT/pages/9.design/9d.ReadSource.adoc
index 7a00f5129..0e0a61b73 100644
--- a/doc/modules/ROOT/pages/8.design/8d.ReadSource.adoc
+++ b/doc/modules/ROOT/pages/9.design/9d.ReadSource.adoc
@@ -24,14 +24,16 @@ concept ReadSource =
 
 === `read_some(buffers)` -- Partial Read (inherited from `ReadStream`)
 
-Reads one or more bytes from the source into the buffer sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes read. May return fewer bytes than the buffer can hold.
+Attempts to read up to `buffer_size(buffers)` bytes from the source into the buffer sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes read. May return fewer bytes than the buffer can hold.
 
 ==== Semantics
 
-- On success: `!ec`, `n >= 1` and `n \<= buffer_size(buffers)`.
-- On EOF: `ec == cond::eof`, `n == 0`.
-- On error: `ec`, `n == 0`.
-- If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`.
+If `buffer_size(buffers) > 0`:
+
+- If `!ec`, then `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were read into the buffer sequence.
+- If `ec`, then `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes read before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the source.
 
 Once `read_some` returns an error (including EOF), the caller must not call `read_some` again. The stream is done. Not all implementations can reproduce a prior error on subsequent calls, so the behavior after an error is undefined.
 
@@ -298,6 +300,13 @@ task<> relay(Src& src, Sink& dest)
     {
         auto [ec, n] = co_await src.read_some(
             mutable_buffer(buf));
+
+        auto [wec, nw] = co_await dest.write(
+            const_buffer(buf, n));
+
+        if(wec)
+            co_return;
+
         if(ec == cond::eof)
         {
             auto [wec] = co_await dest.write_eof();
@@ -305,11 +314,6 @@ task<> relay(Src& src, Sink& dest)
         }
         if(ec)
             co_return;
-
-        auto [wec, nw] = co_await dest.write(
-            const_buffer(buf, n));
-        if(wec)
-            co_return;
     }
 }
 ----
@@ -346,16 +350,16 @@ Examples of types that satisfy `ReadSource`:
 - **File source**: `read_some` is a single `read()` syscall. `read` loops until the buffer is filled or EOF.
 - **Memory source**: `read_some` returns available bytes. `read` fills the buffer from the memory region.
 
-== Why `read_some` Returns No Data on EOF
+== Errors May Accompany Data
 
-The `read_some` contract (inherited from `ReadStream`) requires that when `ec == cond::eof`, `n` is always 0. Data and EOF are delivered in separate calls. See xref:8.design/8a.ReadStream.adoc#_design_foundations_why_errors_exclude_data[ReadStream: Why Errors Exclude Data] for the full rationale. The key points:
+The `read_some` contract (inherited from `ReadStream`) permits `n > 0` when `ec` is set, including on EOF. The implementation reports exactly what happened: the bytes that arrived and the condition that stopped the transfer. See xref:9.design/9c.ReadStream.adoc#_design_foundations_why_errors_may_accompany_data[ReadStream: Why Errors May Accompany Data] for the full rationale. The key points:
 
-- The clean trichotomy (success/EOF/error, where data implies success) eliminates an entire class of bugs where callers accidentally drop the final bytes of a stream.
-- Write-side atomicity (`write_eof(buffers)`) serves correctness for protocol framing. Read-side piggybacking would be a minor optimization with significant API cost.
-- Every concrete source type naturally separates its last data delivery from its EOF indication.
-- POSIX `read()` follows the same model.
+- The `(error_code, size_t)` return type can carry both values simultaneously, transcending the POSIX limitation of reporting only one per call.
+- Layered streams (TLS, compression) may encounter an error after a partial transfer. Allowing `(ec, n)` with `n > 0` avoids forcing deferred-error bookkeeping or data loss.
+- The canonical advance-then-check loop handles both cases correctly with no additional call-site cost.
+- Concrete types that naturally produce `(ec, 0)` on error (POSIX socket wrappers) continue to do so.
 
-This contract carries over to `ReadSource` unchanged. The `read` member function (complete read) _does_ allow `n > 0` on EOF, because it is a composed loop that accumulates data across multiple internal `read_some` calls. When the underlying stream signals EOF mid-accumulation, discarding the bytes already gathered would be wrong. The caller needs `n` to know how much valid data landed in the buffer.
+This contract carries over to `ReadSource` unchanged. Both `read_some` and `read` allow `n > 0` on error or EOF, reporting the bytes that were transferred before the condition arose.
 
 == Summary
 
@@ -370,7 +374,7 @@ This contract carries over to `ReadSource` unchanged. The `read` member function
 | Function | Contract | Use Case
 
 | `ReadSource::read_some`
-| Returns one or more bytes. May fill less than the buffer.
+| Attempts to read up to `buffer_size(buffers)` bytes. May fill less than the buffer.
 | Relays, low-latency forwarding, incremental processing.
 
 | `ReadSource::read`
diff --git a/doc/modules/ROOT/pages/8.design/8e.BufferSource.adoc b/doc/modules/ROOT/pages/9.design/9e.BufferSource.adoc
similarity index 100%
rename from doc/modules/ROOT/pages/8.design/8e.BufferSource.adoc
rename to doc/modules/ROOT/pages/9.design/9e.BufferSource.adoc
diff --git a/doc/modules/ROOT/pages/8.design/8f.WriteStream.adoc b/doc/modules/ROOT/pages/9.design/9f.WriteStream.adoc
similarity index 92%
rename from doc/modules/ROOT/pages/8.design/8f.WriteStream.adoc
rename to doc/modules/ROOT/pages/9.design/9f.WriteStream.adoc
index 7e7ccaaba..42cebd6c4 100644
--- a/doc/modules/ROOT/pages/8.design/8f.WriteStream.adoc
+++ b/doc/modules/ROOT/pages/9.design/9f.WriteStream.adoc
@@ -23,16 +23,25 @@ A `WriteStream` provides a single operation:
 
 === `write_some(buffers)` -- Partial Write
 
-Writes one or more bytes from the buffer sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes written.
+Attempts to write up to `buffer_size(buffers)` bytes from the buffer sequence to the stream. Returns `(error_code, std::size_t)` where `n` is the number of bytes written.
 
 ==== Semantics
 
-- On success: `!ec`, `n >= 1` and `n \<= buffer_size(buffers)`.
-- On error: `ec`, `n == 0`.
-- If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`.
+If `buffer_size(buffers) > 0`:
+
+- If `!ec`, then `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were written from the buffer sequence.
+- If `ec`, then `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes written before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
 
 The caller must not assume that all bytes are consumed. `write_some` may write fewer bytes than offered. This is the defining property of a partial-write primitive.
 
+==== Error Reporting
+
+I/O conditions arising from the underlying I/O system (connection reset, broken pipe, etc.) are reported via the `error_code` component of the return value. Failures in the library wrapper itself (such as memory allocation failure) are reported via exceptions.
+
+*Throws:* `std::bad_alloc` if coroutine frame allocation fails.
+
 ==== Buffer Lifetime
 
 The caller must ensure that the memory referenced by `buffers` remains valid until the `co_await` expression returns.
diff --git a/doc/modules/ROOT/pages/8.design/8g.WriteSink.adoc b/doc/modules/ROOT/pages/9.design/9g.WriteSink.adoc
similarity index 94%
rename from doc/modules/ROOT/pages/8.design/8g.WriteSink.adoc
rename to doc/modules/ROOT/pages/9.design/9g.WriteSink.adoc
index cc2b55f3a..3e9aae202 100644
--- a/doc/modules/ROOT/pages/8.design/8g.WriteSink.adoc
+++ b/doc/modules/ROOT/pages/9.design/9g.WriteSink.adoc
@@ -48,15 +48,18 @@ concept WriteSink =
 
 === `write_some(buffers)` -- Partial Write
 
-Writes one or more bytes from the buffer sequence. May consume less than the full sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes written.
+Attempts to write up to `buffer_size(buffers)` bytes from the buffer sequence to the stream. May consume less than the full sequence. Returns `(error_code, std::size_t)` where `n` is the number of bytes written.
 
 This is the low-level primitive inherited from `WriteStream`. It is appropriate when the caller manages its own consumption loop or when forwarding data incrementally without needing a complete-write guarantee.
 
 ==== Semantics
 
-- On success: `!ec`, `n >= 1`.
-- On error: `ec`, `n == 0`.
-- If `buffer_empty(buffers)`: completes immediately, `!ec`, `n == 0`.
+If `buffer_size(buffers) > 0`:
+
+- If `!ec`, then `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were written from the buffer sequence.
+- If `ec`, then `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes written before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
 
 ==== When to Use
 
@@ -177,16 +180,8 @@ task<> relay(Source& src, Sink& dest)
     {
         auto [ec, n] = co_await src.read_some(
             mutable_buffer(buf));
-        if(ec == cond::eof)
-        {
-            // Signal EOF to the destination
-            auto [ec2] = co_await dest.write_eof();
-            co_return;
-        }
-        if(ec)
-            co_return;
 
-        // Interior: partial write is acceptable
+        // Forward whatever arrived before checking the error
         std::size_t written = 0;
         while(written < n)
         {
@@ -196,11 +191,19 @@ task<> relay(Source& src, Sink& dest)
                 co_return;
             written += n2;
         }
+
+        if(ec == cond::eof)
+        {
+            auto [ec2] = co_await dest.write_eof();
+            co_return;
+        }
+        if(ec)
+            co_return;
     }
 }
 ----
 
-The interior loop uses `write_some` because the relay does not need complete-write guarantees for intermediate data. When `read_some` returns EOF, `n` is 0 (per the `ReadStream` contract), so the relay signals EOF via `write_eof()` with no data.
+The interior loop uses `write_some` because the relay does not need complete-write guarantees for intermediate data. When `read_some` returns EOF, any partial bytes are forwarded first, then the relay signals EOF via `write_eof()` with no data.
 
 === Writing Complete Messages
 
@@ -352,7 +355,7 @@ A three-level hierarchy was considered, with an intermediate concept (`WriteClos
 | Function | Contract | Use Case
 
 | `write_some(buffers)`
-| Writes one or more bytes. May consume less than the full sequence.
+| Attempts to write up to `buffer_size(buffers)` bytes. May consume less than the full sequence.
 | Relay interiors, backpressure, implementing composed algorithms.
 
 | `write(buffers)`
diff --git a/doc/modules/ROOT/pages/8.design/8h.BufferSink.adoc b/doc/modules/ROOT/pages/9.design/9h.BufferSink.adoc
similarity index 100%
rename from doc/modules/ROOT/pages/8.design/8h.BufferSink.adoc
rename to doc/modules/ROOT/pages/9.design/9h.BufferSink.adoc
diff --git a/doc/modules/ROOT/pages/8.design/8i.TypeEraseAwaitable.adoc b/doc/modules/ROOT/pages/9.design/9i.TypeEraseAwaitable.adoc
similarity index 100%
rename from doc/modules/ROOT/pages/8.design/8i.TypeEraseAwaitable.adoc
rename to doc/modules/ROOT/pages/9.design/9i.TypeEraseAwaitable.adoc
diff --git a/doc/modules/ROOT/pages/8.design/8j.any_buffer_sink.adoc b/doc/modules/ROOT/pages/9.design/9j.any_buffer_sink.adoc
similarity index 99%
rename from doc/modules/ROOT/pages/8.design/8j.any_buffer_sink.adoc
rename to doc/modules/ROOT/pages/9.design/9j.any_buffer_sink.adoc
index fe31e5bc4..48e756573 100644
--- a/doc/modules/ROOT/pages/8.design/8j.any_buffer_sink.adoc
+++ b/doc/modules/ROOT/pages/9.design/9j.any_buffer_sink.adoc
@@ -153,7 +153,7 @@ The serializer never allocates a scratch buffer for the formatted output. The by
 
 == Awaitable Caching
 
-`any_buffer_sink` uses the split vtable pattern described in xref:8.design/8h.TypeEraseAwaitable.adoc[Type-Erasing Awaitables]. Multiple async operations (`commit`, `commit_eof`, plus the four `WriteSink` operations when the concrete type supports them) share a single cached awaitable storage region.
+`any_buffer_sink` uses the split vtable pattern described in xref:9.design/9i.TypeEraseAwaitable.adoc[Type-Erasing Awaitables]. Multiple async operations (`commit`, `commit_eof`, plus the four `WriteSink` operations when the concrete type supports them) share a single cached awaitable storage region.
 
 The constructor computes the maximum size and alignment across all awaitable types that the concrete type can produce and allocates that storage once. This reserves all virtual address space at construction time, so memory usage is measurable at server startup rather than growing piecemeal as requests arrive.
 
diff --git a/doc/modules/ROOT/pages/8.design/8k.Executor.adoc b/doc/modules/ROOT/pages/9.design/9k.Executor.adoc
similarity index 66%
rename from doc/modules/ROOT/pages/8.design/8k.Executor.adoc
rename to doc/modules/ROOT/pages/9.design/9k.Executor.adoc
index 1060fa8ba..bafb069b6 100644
--- a/doc/modules/ROOT/pages/8.design/8k.Executor.adoc
+++ b/doc/modules/ROOT/pages/9.design/9k.Executor.adoc
@@ -2,7 +2,7 @@
 
 == Overview
 
-This document describes the design of the `Executor` concept: the interface through which coroutines are scheduled for execution. It explains the relationship to Asio's executor model, why `dispatch` returns `void`, why `defer` was dropped, how `executor_ref` achieves zero-allocation type erasure, and the I/O completion pattern that motivates the entire design.
+This document describes the design of the `Executor` concept: the interface through which coroutines are scheduled for execution. It explains the relationship to Asio's executor model, why `dispatch` returns `std::coroutine_handle<>`, why `defer` was dropped, how `executor_ref` achieves zero-allocation type erasure, and the I/O completion pattern that motivates the entire design.
 
 The `Executor` concept exists to answer one question: when a coroutine is ready to run, _where_ does it run? The concept captures the rules for scheduling coroutine resumption, tracking outstanding work for graceful shutdown, and accessing the execution context that owns the executor. Every I/O awaitable in Corosio -- sockets, acceptors, timers, resolvers -- depends on this concept to dispatch completions back to the correct executor.
 
@@ -15,7 +15,7 @@ concept Executor =
     std::is_nothrow_copy_constructible_v<E> &&
     std::is_nothrow_move_constructible_v<E> &&
     requires(E& e, E const& ce, E const& ce2,
-             std::coroutine_handle<> h)
+             continuation c)
     {
         { ce == ce2 } noexcept -> std::convertible_to<bool>;
         { ce.context() } noexcept;
@@ -28,20 +28,22 @@ concept Executor =
         { ce.on_work_started() } noexcept;
         { ce.on_work_finished() } noexcept;
 
-        { ce.dispatch(h) };
-        { ce.post(h) };
+        { ce.dispatch(c) } -> std::same_as<std::coroutine_handle<>>;
+        { ce.post(c) };
     };
 ----
 
-An `Executor` provides exactly two operations on a coroutine handle:
+An `Executor` provides exactly two scheduling operations:
 
-=== `dispatch(h)` -- Execute If Safe
+=== `dispatch(c)` -- Execute If Safe
 
-If the executor determines it is safe (e.g., the current thread is already associated with the executor's context), resumes the coroutine inline via `h.resume()`. Otherwise, posts the coroutine for later execution. Returns `void`.
+If the executor determines it is safe (e.g., the current thread is already associated with the executor's context), returns `c.h` for symmetric transfer. Otherwise, posts the continuation for later execution and returns `std::noop_coroutine()`. The caller uses the returned handle for symmetric transfer from `await_suspend`, or calls `.resume()` at the event loop pump level.
 
-=== `post(h)` -- Always Queue
+=== `post(c)` -- Always Queue
 
-Queues the coroutine for later execution without ever executing it inline. Never blocks. Use when guaranteed asynchrony is required.
+Queues the continuation for later execution without ever executing it inline. Never blocks. The continuation is linked into the executor's internal queue via its intrusive `next` pointer -- no per-post heap allocation.
+
+Both operations accept `continuation&` rather than `std::coroutine_handle<>`. A `continuation` wraps a coroutine handle with an intrusive list pointer, enabling zero-allocation queuing.
 
 The remaining operations support context access, lifecycle management, and identity:
 
@@ -77,52 +79,36 @@ Capy retains the core elements of this model:
 Capy removes or changes:
 
 - **`defer`.** Dropped entirely. See <<why-not-defer>>.
-- **Function object submission.** Asio executors accept arbitrary callables. Capy executors accept only `std::coroutine_handle<>`. This removes the need for allocator-aware function erasure and enables a simpler, cheaper type-erased wrapper (`executor_ref`).
-- **`dispatch` return type.** Asio's `dispatch` returns void for the same reason Capy's does, but Capy also considered and rejected a `coroutine_handle<>` return for symmetric transfer. See <<why-dispatch-returns-void>>.
+- **Function object submission.** Asio executors accept arbitrary callables. Capy executors accept `continuation&` -- a coroutine handle wrapped with an intrusive queue pointer. This removes the need for allocator-aware function erasure, eliminates per-post heap allocation, and enables a simpler, cheaper type-erased wrapper (`executor_ref`).
+- **`dispatch` return type.** Asio's `dispatch` returns void. Capy's `dispatch` returns `std::coroutine_handle<>` for symmetric transfer. See <<why-dispatch-returns-handle>>.
 
 The result is a concept that preserves Asio's proven execution model while removing the machinery that a coroutine-native library does not need.
 
-[[why-dispatch-returns-void]]
-== Why `dispatch` Returns `void`
-
-An earlier design had `dispatch` return `std::coroutine_handle<>` so that callers could use it for symmetric transfer from `await_suspend`. This was rejected because it violates a fundamental constraint of the I/O layer.
-
-=== The Problem: Synchronous Completion During `await_suspend`
-
-When an I/O awaitable initiates an operation inside `await_suspend`, the I/O might complete immediately. If it does, the completion path would call `dispatch(h)` while the caller's `await_suspend` is still on the call stack. If `dispatch` resumed the coroutine inline via `h.resume()`, the coroutine would execute while `await_suspend` has not yet returned -- resuming a coroutine from inside `await_suspend` before the suspension machinery completes risks undefined behavior.
-
-The {cpp} standard describes the sequencing in https://eel.is/c++draft/expr.await[[expr.await]/5.1]:
-
-[quote]
-____
-If the result of await-ready is false, the coroutine is
-considered suspended. Then, await-suspend is evaluated.
-____
-
-Although the standard considers the coroutine suspended before `await_suspend` is called, resuming it from _within_ `await_suspend` creates a nested resumption on the same call stack. The resumed coroutine runs, potentially suspends again or completes, and then control returns into the middle of `await_suspend`. If the coroutine was destroyed during resumption, `await_suspend` returns into a destroyed frame.
-
-=== Why I/O Awaitables Return `void` or `std::noop_coroutine()`
+[[why-dispatch-returns-handle]]
+== Why `dispatch` Returns `std::coroutine_handle<>`
 
-To avoid this, all I/O awaitables return `void` or `std::noop_coroutine()` from `await_suspend`. Both forms guarantee that the caller is fully suspended and the call stack has unwound before any completion handler can resume the coroutine. The I/O operation is initiated during `await_suspend`, but the completion is dispatched later -- from the event loop, after `await_suspend` has returned.
+`dispatch` returns a `std::coroutine_handle<>` so that callers can use it for symmetric transfer from `await_suspend`. When the executor determines that inline resumption is safe, it returns `c.h` -- the caller returns this from `await_suspend` and the compiler performs a tail-call transfer to the target coroutine. When inline resumption is not safe, the executor queues the continuation and returns `std::noop_coroutine()`, which suspends the caller without resuming anything.
 
-https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p0913r1.html[P0913R1] introduced the `coroutine_handle<Z>` return type for symmetric transfer, which is the correct mechanism for coroutine-to-coroutine control transfer (as used by `task<T>` internally). But I/O awaitables cannot use it because the I/O completion is asynchronous relative to `await_suspend` -- it comes from the reactor or proactor, not from the awaitable itself.
-
-=== Consequence for `dispatch`
-
-Since the primary consumer of `dispatch` is I/O completion -- called _after_ the coroutine is suspended, from the event loop -- `dispatch` does not need to participate in symmetric transfer. It calls `h.resume()` inline when safe and returns `void`. A conforming implementation looks like:
+A conforming implementation:
 
 [source,cpp]
 ----
-void dispatch(std::coroutine_handle<> h) const
+std::coroutine_handle<> dispatch(continuation& c) const
 {
     if(ctx_.running_in_this_thread())
-        h.resume();
-    else
-        post(h);
+        return c.h;            // symmetric transfer
+    post(c);
+    return std::noop_coroutine();
 }
 ----
 
-After `dispatch` returns, the state of `h` is unspecified. The coroutine may have completed, been destroyed, or suspended at a different point. Callers must not use `h` after calling `dispatch`.
+This design enables the common fast path -- same-executor dispatch at `final_suspend` -- to avoid queuing entirely, transferring control directly via symmetric transfer.
+
+=== I/O Awaitables and Symmetric Transfer
+
+I/O awaitables return `std::noop_coroutine()` from `await_suspend` rather than a handle for symmetric transfer. The I/O operation is initiated during `await_suspend`, but completion comes from the reactor or proactor asynchronously. The awaitable cannot know which coroutine to transfer to at suspension time.
+
+Symmetric transfer from `dispatch` is used at a different level: when a child coroutine completes and its `final_suspend` dispatches the parent's continuation through the executor. If the parent is on the same executor, `dispatch` returns the parent's handle for direct symmetric transfer. If not, it queues the continuation and returns `std::noop_coroutine()`.
 
 [[why-not-defer]]
 == Why Two Operations, Not Three
@@ -171,17 +157,28 @@ When `task<T>::await_suspend` returns the parent's coroutine handle, the compile
 
 Corosio confirms this in practice: its entire I/O layer -- sockets, acceptors, timers, resolvers, signals -- across all three backends (epoll, IOCP, select) uses only `dispatch` and `post`. No code path requires `defer`.
 
-== Why `std::coroutine_handle<>`, Not Typed Handles
+== Why `continuation`, Not Raw `coroutine_handle<>`
 
-The executor accepts `std::coroutine_handle<>` -- the type-erased handle -- rather than `std::coroutine_handle<P>` for a specific promise type `P`.
+The executor accepts `continuation&` rather than `std::coroutine_handle<>`. A `continuation` wraps the handle with an intrusive `next` pointer for zero-allocation queuing:
 
-This decision has three consequences:
+[source,cpp]
+----
+struct continuation
+{
+    std::coroutine_handle<> h;
+    continuation* next = nullptr;
+};
+----
 
-- **Type erasure is possible.** `executor_ref` wraps any executor behind a uniform interface. If `dispatch` and `post` were templated on the promise type, the vtable would need to be generic over all promise types, making type erasure impractical.
+This design has three consequences:
 
-- **Executor implementations are independent of coroutine internals.** An executor schedules resumption. It does not need to know what the coroutine's promise type is, what value it produces, or how it handles exceptions. The type-erased handle provides exactly the right interface: `resume()` and nothing else.
+- **Zero-allocation posting.** The thread pool links the `continuation` directly into its work queue via `next`. No `new work(h)` per post. The queue node is embedded in the thing being queued -- the awaitable, combinator state, or trampoline promise that owns the continuation.
 
-- **I/O operation structures stay simple.** Every pending I/O operation in Corosio stores two fields: `std::coroutine_handle<> h` (a typedef for `std::coroutine_handle<>`) and `capy::executor_ref ex`. Both are type-erased. The operation structure does not need to be templated on the coroutine's promise type, which keeps the I/O backend code non-generic and out of headers.
+- **Type erasure remains possible.** `executor_ref` wraps any executor behind a uniform vtable. The vtable function pointers accept `continuation&`, which is a concrete type. No templates on promise type are needed.
+
+- **I/O operation structures stay simple.** Every I/O awaitable embeds a `continuation` for the caller's handle and an `executor_ref` for the executor. Both are non-templated, keeping I/O backend code non-generic and out of headers.
+
+The handle within the `continuation` is still type-erased (`std::coroutine_handle<>`) for the same reasons that applied before: executor implementations are independent of coroutine internals, and the type-erased handle provides exactly the right interface (`resume()` and nothing else).
 
 == Why Nothrow Copy and Move
 
@@ -297,53 +294,70 @@ The executor concept is designed around a single use case: I/O completion dispat
 
 === Capture at Initiation
 
-When a coroutine `co_await`s an I/O awaitable, the awaitable's `await_suspend` receives the caller's executor and stores it as `executor_ref`:
+When a coroutine `co_await`s an I/O awaitable, the awaitable's `await_suspend` receives the caller's handle and executor. The awaitable embeds a `continuation` for the caller's handle:
 
 [source,cpp]
 ----
-template<typename Ex>
-auto await_suspend(
+std::coroutine_handle<>
+await_suspend(
     std::coroutine_handle<> h,
-    Ex const& ex) -> std::coroutine_handle<>
+    io_env const* env) noexcept
 {
-    // ex is captured as executor_ref in the operation
-    return socket_.connect(h, ex, endpoint_, token_, &ec_);
+    cont_.h = h;
+    ex_ = env->executor;
+    // ... initiate I/O operation ...
+    return std::noop_coroutine();
 }
 ----
 
-The operation structure stores both the coroutine handle and the executor reference:
+=== Dispatch at Completion
+
+When the I/O completes (from the reactor thread for epoll, the completion port for IOCP, or the select loop), the awaitable uses the stored executor to resume the coroutine via the embedded continuation:
 
 [source,cpp]
 ----
-struct io_op : scheduler_op
-{
-    std::coroutine_handle<> h;
-    capy::executor_ref ex;
-    // ... error codes, buffers, etc.
-};
+// Timer fires or I/O completes:
+ex_.post(cont_);
 ----
 
-=== Dispatch at Completion
+`post` links the continuation into the executor's work queue via `cont_.next`. No heap allocation occurs -- the continuation is embedded in the awaitable, which is alive for the duration of the suspension. A worker thread dequeues the continuation and calls `cont_.h.resume()`.
 
-When the I/O completes (from the reactor thread for epoll, the completion port for IOCP, or the select loop), the operation uses the stored executor to resume the coroutine:
+=== Platform Independence
+
+This pattern is identical across all three Corosio backends: epoll (Linux), IOCP (Windows), and select (POSIX fallback). The executor concept and `executor_ref` provide the abstraction that makes this possible. The backend-specific code deals with I/O readiness or completion notification. The executor-specific code deals with coroutine scheduling. The two concerns are cleanly separated.
+
+== Frame Allocator Preservation
+
+Capy propagates frame allocators via thread-local storage (see xref:../4.coroutines/4g.allocators.adoc#_thread_local_propagation[Thread-Local Propagation]). The TLS value is set in `await_resume` when a coroutine resumes and read in `operator new` when a child coroutine is created. Between these two points, the coroutine body executes arbitrary user code.
+
+If that user code resumes a coroutine from a different chain on the same thread -- by calling `.resume()` directly, pumping a dispatch queue, or running nested event loop work -- the other coroutine's `await_resume` overwrites TLS. The original coroutine's next child then allocates from the wrong resource.
+
+=== The Save/Restore Protocol
+
+The fix is to save and restore TLS around every `.resume()` call:
 
 [source,cpp]
 ----
-void operator()() override
+inline void
+safe_resume(std::coroutine_handle<> h) noexcept
 {
-    // ... set error codes ...
-    capy::executor_ref saved_ex(std::move(ex));
-    std::coroutine_handle<> saved_h(std::move(h));
-    impl_ptr.reset();
-    saved_ex.dispatch(saved_h);
+    auto* saved = get_current_frame_allocator();
+    h.resume();
+    set_current_frame_allocator(saved);
 }
 ----
 
-`dispatch` checks whether the current thread is already running on the executor's context. If so, the coroutine resumes inline. If not, the coroutine is posted for later execution on the correct context.
+This makes TLS behave like a stack. Each nested resume pushes its own allocator; when the coroutine suspends and `.resume()` returns, the previous value is restored. The cost is two TLS accesses (one read, one write) per `.resume()` call -- negligible compared to the cost of resuming a coroutine.
 
-=== Platform Independence
+=== Where It Applies
 
-This pattern is identical across all three Corosio backends: epoll (Linux), IOCP (Windows), and select (POSIX fallback). The executor concept and `executor_ref` provide the abstraction that makes this possible. The backend-specific code deals with I/O readiness or completion notification. The executor-specific code deals with coroutine scheduling. The two concerns are cleanly separated.
+All executor event loops and strand dispatch loops must use `safe_resume` instead of calling `.resume()` directly. Capy's `thread_pool`, `blocking_context`, and `strand_queue` all use it internally.
+
+Two `.resume()` call sites intentionally do _not_ use `safe_resume`:
+
+* **`symmetric_transfer`** (MSVC workaround). The calling coroutine is about to suspend unconditionally. When it later resumes, `await_resume` restores TLS from the promise's stored environment. Save/restore would add overhead with no benefit.
+
+* **`run_async_wrapper::operator()`**. TLS is already saved in the wrapper's constructor and restored in its destructor, which bracket the entire task lifetime.
 
 == Why Not `std::execution` (P2300)
 
@@ -358,7 +372,7 @@ start(op);                           //   -- too late
 
 For coroutines, this ordering is fatal. Coroutine frame allocation happens _before_ the coroutine body executes. The compiler calls `operator new` first, then constructs the promise, then begins execution. Any mechanism that provides the allocator _after_ the coroutine call -- receiver queries, `await_transform`, explicit method calls -- arrives after the frame is already allocated with the wrong (or default) allocator.
 
-Capy's model flows context _forward_ from launcher to task. The `run_async(ex, alloc)(my_task())` two-phase invocation sets the thread-local allocator _before_ the task expression is evaluated, so `operator new` reads it in time. This is described in detail in xref:8.design/8l.RunApi.adoc[Run API].
+Capy's model flows context _forward_ from launcher to task. The `run_async(ex, alloc)(my_task())` two-phase invocation sets the thread-local allocator _before_ the task expression is evaluated, so `operator new` reads it in time. This is described in detail in xref:9.design/9l.RunApi.adoc[Run API].
 
 The same forward-flowing model applies to executors. The launcher binds the executor before the task runs. The task's promise stores the executor and propagates it to nested awaitables via `await_transform`. Context flows from caller to callee at every level, never backward.
 
@@ -376,8 +390,8 @@ public:
     void on_work_started() const noexcept;
     void on_work_finished() const noexcept;
 
-    void dispatch(std::coroutine_handle<> h) const;
-    void post(std::coroutine_handle<> h) const;
+    std::coroutine_handle<> dispatch(continuation& c) const;
+    void post(continuation& c) const;
 
     bool operator==(my_executor const&) const noexcept;
 };
@@ -385,6 +399,6 @@ public:
 
 == Summary
 
-The `Executor` concept provides `dispatch` and `post` for coroutine scheduling, work tracking for event loop lifetime, and `context()` for service access. The design descends from Asio's executor model but is adapted for coroutines: `defer` is replaced by symmetric transfer, function objects are replaced by `std::coroutine_handle<>`, and `dispatch` returns `void` because I/O completions are dispatched after suspension, not during it.
+The `Executor` concept provides `dispatch` and `post` for coroutine scheduling, work tracking for event loop lifetime, and `context()` for service access. The design descends from Asio's executor model but is adapted for coroutines: `defer` is replaced by symmetric transfer, function objects are replaced by `continuation&` for zero-allocation intrusive queuing, and `dispatch` returns `std::coroutine_handle<>` for symmetric transfer at `final_suspend`.
 
-`executor_ref` type-erases any executor into two pointers, enabling platform-independent I/O completion dispatch with zero allocation and predictable cache behavior. The capture-at-initiation / dispatch-at-completion pattern is the fundamental use case the concept serves.
+`executor_ref` type-erases any executor into two pointers, enabling platform-independent I/O completion dispatch with zero allocation and predictable cache behavior. The capture-at-initiation / dispatch-at-completion pattern is the fundamental use case the concept serves.
\ No newline at end of file
diff --git a/doc/modules/ROOT/pages/8.design/8l.RunApi.adoc b/doc/modules/ROOT/pages/9.design/9l.RunApi.adoc
similarity index 99%
rename from doc/modules/ROOT/pages/8.design/8l.RunApi.adoc
rename to doc/modules/ROOT/pages/9.design/9l.RunApi.adoc
index 2c9fb6db0..a29495f22 100644
--- a/doc/modules/ROOT/pages/8.design/8l.RunApi.adoc
+++ b/doc/modules/ROOT/pages/9.design/9l.RunApi.adoc
@@ -232,7 +232,7 @@ Coroutine frame allocation happens _before_ the coroutine body executes. When th
 
 Any mechanism that injects the allocator _after_ the call -- receiver queries, `await_transform`, explicit method calls -- arrives too late. The frame is already allocated.
 
-This is the fundamental tension identified in D4003 �3.3:
+This is the fundamental tension identified in D4003 section 3.3:
 
 [quote]
 ____
diff --git a/doc/modules/ROOT/pages/8.design/8m.WhyNotCobalt.adoc b/doc/modules/ROOT/pages/9.design/9m.WhyNotCobalt.adoc
similarity index 98%
rename from doc/modules/ROOT/pages/8.design/8m.WhyNotCobalt.adoc
rename to doc/modules/ROOT/pages/9.design/9m.WhyNotCobalt.adoc
index a0b1635da..b28a2a6b5 100644
--- a/doc/modules/ROOT/pages/8.design/8m.WhyNotCobalt.adoc
+++ b/doc/modules/ROOT/pages/9.design/9m.WhyNotCobalt.adoc
@@ -134,7 +134,7 @@ Templates can achieve this by type-erasing every customization point. The cost m
 
 == Stream Concepts
 
-Capy defines seven coroutine-only stream concepts. Cobalt inherits Asio's `AsyncReadStream` and `AsyncWriteStream`, which are hybrid concepts supporting callbacks, futures, and coroutines. Cobalt's `cobalt::io` wrappers simplify the API and Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes, a distinct approach from Capy's concept-based hierarchy. Cobalt's wrappers still include full Asio headers. See xref:8.design/8n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a detailed comparison of the two approaches.
+Capy defines seven coroutine-only stream concepts. Cobalt inherits Asio's `AsyncReadStream` and `AsyncWriteStream`, which are hybrid concepts supporting callbacks, futures, and coroutines. Cobalt's `cobalt::io` wrappers simplify the API and Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes, a distinct approach from Capy's concept-based hierarchy. Cobalt's wrappers still include full Asio headers. See xref:9.design/9n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a detailed comparison of the two approaches.
 
 Capy's concepts form a refinement hierarchy that emerged naturally from use-case-first design:
 
@@ -192,7 +192,7 @@ Traditional approaches to type erasure in Asio focus on the lowest-level element
 
 Capy type-erases the stream itself. This is possible because coroutines provide structural type erasure — the continuation is always a handle, not a template parameter. When the library is coroutines-only, one virtual call per I/O operation is the total cost. The completion handler, executor, and allocator do not need individual erasure because they are not part of the stream's operation signature.
 
-Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes in `cobalt/io/stream.hpp`, taking a different approach from Capy's concept + type-erased wrapper model. See xref:8.design/8n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a side-by-side analysis.
+Cobalt defines stream abstractions (`write_stream`, `read_stream`, `stream`) as abstract base classes in `cobalt/io/stream.hpp`, taking a different approach from Capy's concept + type-erased wrapper model. See xref:9.design/9n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a side-by-side analysis.
 
 The wrappers compose. `any_buffer_source` also satisfies `ReadSource` — natively if the wrapped type supports both, synthesized otherwise. `any_buffer_sink` also satisfies `WriteSink`. You pick the abstraction level you need.
 
@@ -245,7 +245,7 @@ This is how the Dimovian Ideal is mechanically achieved.
 
 == Mock Streams and Testability
 
-When algorithms operate on type-erased interfaces, testing becomes deterministic. Capy provides mock implementations for every stream concept. Cobalt defines stream abstractions as abstract base classes but does not provide mock implementations for testing. See xref:8.design/8n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a comparison of the two stream designs.
+When algorithms operate on type-erased interfaces, testing becomes deterministic. Capy provides mock implementations for every stream concept. Cobalt defines stream abstractions as abstract base classes but does not provide mock implementations for testing. See xref:9.design/9n.WhyNotCobaltConcepts.adoc[Write Stream Design] for a comparison of the two stream designs.
 
 Capy's mock types:
 
@@ -433,23 +433,11 @@ Capy has one `DynamicBuffer` concept. The v1/v2 split in Asio exists because of
 | Yes
 |
 
-| `buffer_pair`
-| Yes
-|
-
-| `slice`
-| Yes
-|
-
 | `front`
 | Yes
 |
 
-| `consuming_buffers`
-| Yes
-|
-
-| `buffer_array`
+| `buffer_slice`
 | Yes
 |
 
diff --git a/doc/modules/ROOT/pages/8.design/8n.WhyNotCobaltConcepts.adoc b/doc/modules/ROOT/pages/9.design/9n.WhyNotCobaltConcepts.adoc
similarity index 95%
rename from doc/modules/ROOT/pages/8.design/8n.WhyNotCobaltConcepts.adoc
rename to doc/modules/ROOT/pages/9.design/9n.WhyNotCobaltConcepts.adoc
index 8884e24b0..7a0c8b8ff 100644
--- a/doc/modules/ROOT/pages/8.design/8n.WhyNotCobaltConcepts.adoc
+++ b/doc/modules/ROOT/pages/9.design/9n.WhyNotCobaltConcepts.adoc
@@ -298,17 +298,22 @@ Capy's `WriteStream` concept includes semantic requirements in the concept's doc
 
 // Semantic Requirements:
 //
-// If buffer_size( buffers ) > 0, the operation writes one or more
-// bytes of data to the stream from the buffer sequence:
+// Attempts to write up to buffer_size( buffers ) bytes from
+// the buffer sequence to the stream.
 //
-//   On success: !ec, and n is the number of bytes written.
-//   On error: ec, and n is 0.
+// If buffer_size( buffers ) > 0:
 //
-// If buffer_empty( buffers ) is true, the operation completes
-// immediately. !ec, and n is 0.
+//   If !ec, then n >= 1 && n <= buffer_size( buffers ).
+//     n bytes were written from the buffer sequence.
+//   If ec, then n >= 0 && n <= buffer_size( buffers ).
+//     n is the number of bytes written before the I/O
+//     condition arose.
 //
-// Buffers in the sequence are written completely before proceeding
-// to the next buffer.
+// If buffer_empty( buffers ) is true, n is 0. The empty
+// buffer is not itself a cause for error, but ec may reflect
+// the state of the stream.
+//
+// Buffers in the sequence are consumed in order.
 //
 // Buffer Lifetime:
 //
@@ -336,7 +341,7 @@ The concept also includes a coroutine-specific warning about buffer lifetime:
 |
 
 | Error reporting semantics
-| Documented (`ec` + `n == 0`)
+| Documented (`ec` + `n >= 0 && n \<= buffer_size`)
 |
 
 | Partial write guarantees
@@ -395,7 +400,7 @@ ____
 
 The documentation describes the mechanical role of each function pointer but does not specify what the `implementation` function must do with the buffer, what completion semantics to follow, how to report errors through the `completion_handler`, or under what conditions `try_implementation` should complete synchronously. Implementors can look to the existing I/O wrappers (e.g., `stream_socket`) as reference implementations.
 
-In Capy, the implementation contract lives in the `WriteStream` concept definition. A type satisfies `WriteStream` by providing a `write_some` member function template that returns an `IoAwaitable` decomposing to `(error_code, std::size_t)`. The semantic requirements are part of the concept. There is no separate operation type to construct and no function pointers to provide.
+In Capy, the implementation contract lives in the `WriteStream` concept definition. A type satisfies `WriteStream` by providing a `write_some` member function template that await-returns `(error_code, std::size_t)`. The semantic requirements are part of the concept. There is no separate operation type to construct and no function pointers to provide.
 
 [cols="1,1,1"]
 |===
diff --git a/doc/modules/ROOT/pages/8.design/8o.WhyNotTMC.adoc b/doc/modules/ROOT/pages/9.design/9o.WhyNotTMC.adoc
similarity index 100%
rename from doc/modules/ROOT/pages/8.design/8o.WhyNotTMC.adoc
rename to doc/modules/ROOT/pages/9.design/9o.WhyNotTMC.adoc
diff --git a/doc/modules/ROOT/pages/A.specification-methods/A.intro.adoc b/doc/modules/ROOT/pages/A.specification-methods/A.intro.adoc
new file mode 100644
index 000000000..83184b6ef
--- /dev/null
+++ b/doc/modules/ROOT/pages/A.specification-methods/A.intro.adoc
@@ -0,0 +1,16 @@
+//
+// Copyright (c) 2026 Andrzej Krzemieński (akrzemi1@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Methods of API Description
+
+
+
+This section describes the conventions used to specify the API of this library in the following 
+xref:reference:boost/capy.adoc[Reference] section.
+
diff --git a/doc/modules/ROOT/pages/A.specification-methods/Ab.cancellation.adoc b/doc/modules/ROOT/pages/A.specification-methods/Ab.cancellation.adoc
new file mode 100644
index 000000000..145b7b1c4
--- /dev/null
+++ b/doc/modules/ROOT/pages/A.specification-methods/Ab.cancellation.adoc
@@ -0,0 +1,20 @@
+//
+// Copyright (c) 2026 Andrzej Krzemieński (akrzemi1@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Cancellation
+
+A function is said to _support IoAwaitable cancellation_ when its return type 
+models concept `IoAwaitable` and this return object `a` controls a coroutine which
+can be prematurely stopped using the `std::stop_token` propagated through the 
+`IoAwaitable` protocol. Additionally, if the result type of expression `co_await a`
+in the context of a Capy-coroutine is a specialization of `io_result` 
+then the cancelling of an operation is 
+considered a contingency represented by condition `cond::canceled`.
+
+
diff --git a/doc/modules/ROOT/pages/A.specification-methods/Ac.contingencies.adoc b/doc/modules/ROOT/pages/A.specification-methods/Ac.contingencies.adoc
new file mode 100644
index 000000000..54ccc9caa
--- /dev/null
+++ b/doc/modules/ROOT/pages/A.specification-methods/Ac.contingencies.adoc
@@ -0,0 +1,44 @@
+//
+// Copyright (c) 2026 Andrzej Krzemieński (akrzemi1@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+= Contingencies
+
+A _contingency_ is any situation occurring during an operation
+on a stream, caused by the stream's state, that prevents this operation
+from reading or writing the requested number of bytes.
+
+These situations do not violate the postconditions of the corresponding operations, 
+as their postconditions never say that the requested number of bytes will indeed be
+processed.
+
+Each stream operation that may encounter a contingency await-returns
+a type which is a specialization of `capy::io_result`. These objects can be _destructured_
+using a structured binding. The first binding of such destructuring is of type
+`std::error_code`. This binding, call it `ec`, is used to signal if and which
+contingency occured:
+
+ * If `ec == std::error_code{}`, no contingency occurred.
+ * Otherwise a contingency occurred. In order to determine which contingency occurred,
+   compare `ec` to error conditions, in particular to `capy::cond`.
+
+NOTE: Reaching the end of stream is also a contingency
+      (which can be interpreted as preventing an infinite read
+      from proceeding).
+
+NOTE: The stream operations can still throw exceptions to indicate conditions
+      unrelated to stream state that prevent these operations from satisfying
+      their postconditions, such as failures to grow a buffer, or 
+      failure to allocate a coroutine frame.
+
+NOTE: Operations on streams often await-return `capy::io_result<std::size_t>`
+      destructuring to `[ec, n]`, where `n` represents the number of processed bytes.
+      Upon a reported contingency, a non-zero `n` indicates the state of the partial
+      or sometimes even a full read. When an inner operation reports a contingency,
+      the outer operation usually processes its partial results before reporting
+      the contingency itself.
\ No newline at end of file
diff --git a/doc/modules/ROOT/pages/index.adoc b/doc/modules/ROOT/pages/index.adoc
index bd37d0769..a0153eef3 100644
--- a/doc/modules/ROOT/pages/index.adoc
+++ b/doc/modules/ROOT/pages/index.adoc
@@ -87,10 +87,13 @@ task<> echo(any_stream& stream)
     for(;;)
     {
         auto [ec, n] = co_await stream.read_some(mutable_buffer(buf));
-        if(ec.failed())
-            co_return;
+
         auto [wec, wn] = co_await write(stream, const_buffer(buf, n));
-        if(wec.failed())
+
+        if(ec)
+            co_return;
+
+        if(wec)
             co_return;
     }
 }
@@ -111,8 +114,8 @@ The `task<>` return type (equivalent to `task<void>`) creates a lazy coroutine t
 == Next Steps
 
 * xref:quick-start.adoc[Quick Start] — Set up your first Capy project
-* xref:cpp20-coroutines/foundations.adoc[{cpp}20 Coroutines Tutorial] — Learn coroutines from the ground up
-* xref:concurrency/foundations.adoc[Concurrency Tutorial] — Understand threads, mutexes, and synchronization
-* xref:coroutines/tasks.adoc[Coroutines in Capy] — Deep dive into `task<T>` and the IoAwaitable protocol
-* xref:buffers/overview.adoc[Buffer Sequences] — Master the concept-driven buffer model
-* xref:streams/overview.adoc[Stream Concepts] — Understand the six stream concepts
+* xref:2.cpp20-coroutines/2a.foundations.adoc[{cpp}20 Coroutines Tutorial] — Learn coroutines from the ground up
+* xref:3.concurrency/3a.foundations.adoc[Concurrency Tutorial] — Understand threads, mutexes, and synchronization
+* xref:4.coroutines/4a.tasks.adoc[Coroutines in Capy] — Deep dive into `task<T>` and the IoAwaitable protocol
+* xref:5.buffers/5a.overview.adoc[Buffer Sequences] — Master the concept-driven buffer model
+* xref:6.streams/6a.overview.adoc[Stream Concepts] — Understand the six stream concepts
diff --git a/doc/modules/ROOT/pages/quick-start.adoc b/doc/modules/ROOT/pages/quick-start.adoc
index f8ddf18a0..d04ef68f5 100644
--- a/doc/modules/ROOT/pages/quick-start.adoc
+++ b/doc/modules/ROOT/pages/quick-start.adoc
@@ -115,6 +115,6 @@ capy::run_async(executor)(might_fail(),
 
 Now that you have a working program:
 
-* xref:coroutines/tasks.adoc[Tasks] — Learn how lazy tasks work
-* xref:coroutines/launching.adoc[Launching Tasks] — Understand `run_async` in detail
-* xref:coroutines/affinity.adoc[Executor Affinity] — Control where coroutines execute
+* xref:4.coroutines/4a.tasks.adoc[Tasks] — Learn how lazy tasks work
+* xref:4.coroutines/4b.launching.adoc[Launching Tasks] — Understand `run_async` in detail
+* xref:4.coroutines/4c.executors.adoc[Executors and Execution Contexts] — Control where coroutines execute
diff --git a/doc/modules/ROOT/pages/why-capy.adoc b/doc/modules/ROOT/pages/why-capy.adoc
index 55aa1a068..8fe3a0268 100644
--- a/doc/modules/ROOT/pages/why-capy.adoc
+++ b/doc/modules/ROOT/pages/why-capy.adoc
@@ -2,7 +2,7 @@
 
 Boost.Asio is currently the world leader in portable asynchronous I/O. The standard is silent here. The global ecosystem offers nothing comparable.
 
-*Capy is the first offering which surpasses Boost.Asio in its domain*
+*Capy advances beyond Boost.Asio in several specific domains*
 
 The sections that follow will demonstrate this claim. Each section examines a domain where Capy innovates—not by reinventing what works, but by solving problems that have remained unsolved.
 
@@ -14,7 +14,7 @@ But Asio made a pragmatic choice: support every continuation style. Callbacks. F
 
 Capy makes a different choice. It commits fully to coroutines. This isn't a limitation—it's a liberation. When you know the continuation is always a coroutine, you can optimize in ways that hybrid approaches cannot. The frame is always there. The executor context propagates naturally. Cancellation flows downward without ceremony.
 
-No other library in existence offers coroutine-only stream concepts. Capy is the first.
+Asio's stream concepts are hybrid by design. Capy's are coroutine-only, which is what enables the optimizations above.
 
 === What Capy Offers
 
@@ -24,44 +24,30 @@ No other library in existence offers coroutine-only stream concepts. Capy is the
 
 === Comparison
 
-[cols="1,1,1,1"]
+[cols="1,1"]
 |===
-| Capy | Asio | std | World
+| Capy | Asio
 
 | `ReadStream`
 | `AsyncReadStream`*
-|
-|
 
 | `WriteStream`
 | `AsyncWriteStream`*
-|
-|
 
 | `Stream`
-|
-|
-|
+^| -
 
 | `ReadSource`
-|
-|
-|
+^| -
 
 | `WriteSink`
-|
-|
-|
+^| -
 
 | `BufferSource`
-|
-|
-|
+^| -
 
 | `BufferSink`
-|
-|
-|
+^| -
 |===
 
 *Asio's concepts are hybrid (callbacks/futures/coroutines), not coroutine-only
@@ -78,8 +64,6 @@ Coroutines change this equation. A coroutine's continuation is always the same t
 
 Write `any_stream&` and accept any stream. Your function compiles once. It links anywhere. Your build times drop. Your binaries shrink. Your error messages become readable. And because coroutines are ordinary functions (not templates), you get natural ABI stability. Link against a new stream implementation without recompiling your code.
 
-No other library in the world does this. Boost would be first.
-
 === What Capy Offers
 
 * `any_read_stream`, `any_write_stream`, `any_stream` — type-erased partial I/O
@@ -89,69 +73,45 @@ No other library in the world does this. Boost would be first.
 
 === Comparison
 
-[cols="1,1,1,1"]
+[cols="1,1"]
 |===
-| Capy | Asio | std | World
+| Capy | Asio
 
 | `any_read_stream`
-|
-|
-|
+^| -
 
 | `any_write_stream`
-|
-|
-|
+^| -
 
 | `any_stream`
-|
-|
-|
+^| -
 
 | `any_read_source`
-|
-|
-|
+^| -
 
 | `any_write_sink`
-|
-|
-|
+^| -
 
 | `any_buffer_source`
-|
-|
-|
+^| -
 
 | `any_buffer_sink`
-|
-|
-|
+^| -
 
 | `read`
 | `async_read`*
-|
-|
 
 | `write`
 | `async_write`*
-|
-|
 
 | `read_until`
 | `async_read_until`*
-|
-|
 
 | `push_to`
-|
-|
-|
+^| -
 
 | `pull_from`
-|
-|
-|
+^| -
 |===
 
 *Asio's algorithms only support `AsyncReadStream` and `AsyncWriteStream`
@@ -162,7 +122,7 @@ Asio got buffer sequences right. The concept-driven approach—`ConstBufferSeque
 
 Capy doesn't reinvent this. We adopt Asio's buffer sequence model because it works.
 
-But we improve on it. Asio provides the basics; Capy extends them. Need to trim bytes from the front of a buffer sequence? Asio makes you work for it. Capy provides `slice`, `front`, `consuming_buffers`—customization points for efficient byte-level manipulation. Need a circular buffer for protocol parsing? Capy has `circular_dynamic_buffer`. Need to compose two buffers without copying? `buffer_pair`.
+But we improve on it. Asio provides the basics; Capy extends them. Need to trim bytes from the front of a buffer sequence? Asio makes you work for it. Capy provides `buffer_slice` and `front`—byte-range slicing primitives for efficient byte-level manipulation. Need a circular buffer for protocol parsing? Capy has `circular_dynamic_buffer`. Need to compose two buffers without copying? Use `std::array<const_buffer, 2>` (or any range of buffers) directly — Capy's buffer-sequence concepts accept arbitrary ranges.
 
 And then there's the `DynamicBuffer` mess. If you've used Asio, you've encountered the confusing split between `DynamicBuffer_v1` and `DynamicBuffer_v2`. This exists because of a fundamental problem: when an async operation takes a buffer by value and completes via callback, who owns the buffer? The original design had flaws. The "fix" created two incompatible versions. (See https://www.open-std.org/jtc1/sc22/wg21/docs/papers/2018/p1100r0.html[P1100R0] for the full story.)
 
@@ -173,94 +133,59 @@ One more thing: `std::ranges` cannot help here. `ranges::size` returns the numbe
 === What Capy Offers
 
 * `ConstBufferSequence`, `MutableBufferSequence`, `DynamicBuffer` — core concepts (Asio-compatible)
-* `flat_dynamic_buffer`, `circular_dynamic_buffer`, `buffer_pair` — additional concrete types
-* `slice`, `front`, `buffer_array`, `consuming_buffers` — byte-level manipulation utilities
+* `flat_dynamic_buffer`, `circular_dynamic_buffer` — additional concrete buffer types
+* `buffer_slice`, `front` — byte-level manipulation utilities
 
 === Comparison
 
-[cols="1,1,1,1"]
+[cols="1,1"]
 |===
-| Capy | Asio | std | World
+| Capy | Asio
 
 | `ConstBufferSequence`
 | `ConstBufferSequence`
-|
-|
 
 | `MutableBufferSequence`
 | `MutableBufferSequence`
-|
-|
 
 | `DynamicBuffer`
 | `DynamicBuffer_v1`/`v2`*
-|
-|
 
 | `const_buffer`
 | `const_buffer`
-|
-|
 
 | `mutable_buffer`
 | `mutable_buffer`
-|
-|
 
 | `flat_dynamic_buffer`
-|
-|
-|
+^| -
 
 | `circular_dynamic_buffer`
-|
-|
-|
+^| -
 
 | `vector_dynamic_buffer`
 | `dynamic_vector_buffer`
-|
-|
 
 | `string_dynamic_buffer`
 | `dynamic_string_buffer`
-|
-|
 
-| `buffer_pair`
-|
-|
-|
+| `buffer_slice`
+^| -
 
-| `consuming_buffers`
-|
-|
-|
+| `Slice`
+^| -
 
-| `slice`
-|
-|
-|
+| `MutableSlice`
+^| -
 
 | `front`
-|
-|
-|
-
-| `buffer_array`
-|
-|
-|
+^| -
 
 | `buffer_copy`
 | `buffer_copy`
-|
-|
 
 | Byte-level trimming
-|
-|
-|
+^| -
 |===
 
 *Asio has confusing v1/v2 split due to callback composition problems
@@ -293,7 +218,7 @@ And Capy *separates execution from platform*. The execution model—executors, c
 
 Most importantly, Capy defines a *taxonomy of awaitables*. `IoAwaitable` is the base protocol for any type that participates in context propagation. `IoRunnable` refines it with the launch interface needed by `run_async` and `run`. This hierarchy means you can write your own task types that integrate with Capy's execution model. Asio's `awaitable<T>` is a concrete type, not a concept. You use it or you don't. Capy gives you building blocks.
 
-No other solution like this exists. Not Asio. Not `std::execution`. Not anywhere in the global ecosystem. Capy is the first.
+Neither Asio nor `std::execution` offers this combination of forward-flow allocator control, automatic stop-token propagation, and execution/platform separation.
 
 === What Capy Offers
 
@@ -305,99 +230,81 @@ No other solution like this exists. Not Asio. Not `std::execution`. Not anywhere
 
 === Comparison
 
-[cols="1,1,1,1"]
+[cols="1,1,1"]
 |===
-| Capy | Asio | std | World
+| Capy | Asio | std
 
 | `IoAwaitable`
-|
-|
-|
+^| -
+^| -
 
 | `IoRunnable`
-|
-|
-|
+^| -
+^| -
 
 | `io_awaitable_promise_base`
-|
-|
-|
+^| -
+^| -
 
 | `task<T>`
 | `awaitable<T>`*
 | P3552R3**
-|
 
 | `run`
-|
-|
-|
+^| -
+^| -
 
 | `run_async`
 | `co_spawn`*
-|
-|
+^| -
 
 | `strand`
 | `strand`
-|
-|
+^| -
 
 | `executor_ref`
 | `any_executor`
-|
-|
+^| -
 
 | `thread_pool`
 | `thread_pool`
-| `static_thread_pool`
-|
+^| -
 
 | `execution_context`
 | `execution_context`
-|
-|
+^| -
 
 | `frame_allocator`
-|
-|
-|
+^| -
+^| -
 
 | `recycling_memory_resource`
-|
-|
-|
+^| -
+^| -
 
 | `async_mutex`
-|
-|
-|
+^| -
+^| -
 
 | `async_event`
-|
-|
-|
+^| -
+^| -
 
 | `stop_token` propagation
-|
+^| -
 | `stop_token`***
-|
 
 | User-defined task types
-|
-|
-|
+^| -
+^| -
 
 | Execution/platform isolation
-|
-|
-|
+^| -
+^| -
 
 | Forward-flow allocator control
-|
-|
-|
+^| -
+^| -
 |===
 
 *Asio's are not extensible, no concept taxonomy
@@ -408,9 +315,9 @@ No other solution like this exists. Not Asio. Not `std::execution`. Not anywhere
 
 == The Road Ahead
 
-For twenty five years, Boost.Asio has stood alone. It defined what portable asynchronous I/O looks like in {cpp}. No serious competitor offering its depth of offerings has appeared. It defined the promising Networking TS. Asio earned its place through years of production use, careful evolution, and relentless focus on real problems faced by real developers.
+For twenty-five years, Boost.Asio has stood alone. It defined what portable asynchronous I/O looks like in {cpp}. No serious competitor offering its depth of offerings has appeared. It defined the promising Networking TS. Asio earned its place through years of production use, careful evolution, and relentless focus on real problems faced by real developers.
 
-Capy builds on Asio's foundation—the buffer sequences, the executor model, the hard-won lessons about what works. But where Asio must preserve compatibility with over decades of existing code, Capy is free to commit fully to the future. {cpp}20 coroutines are not an afterthought here. They are the foundation.
+Capy builds on Asio's foundation—the buffer sequences, the executor model, the hard-won lessons about what works. But where Asio must preserve compatibility with decades of existing code, Capy is free to commit fully to the future. {cpp}20 coroutines are not an afterthought here. They are the foundation.
 
 The result is something new. Stream concepts designed for coroutines alone. Type-erasure at the level where it matters most. A simple execution model discovered through use-case-first design. Clean separation between execution and platform. A taxonomy of awaitables that invites extension rather than mandating a single concrete type.
 
@@ -418,6 +325,6 @@ Meanwhile, the {cpp} standards committee has produced `std::execution`—a sende
 
 Boost has always been where the practical meets the principled. Where real-world feedback shapes design. Where code ships before papers standardize. Capy continues this tradition.
 
-If you are reading this as a Boost contributor, know what you are part of. This is the first library to advance beyond Asio in the domains where they overlap. Not by abandoning what works, but by building on it. Not by chasing theoretical purity, but by solving the problems that have frustrated {cpp} developers for years: template explosion, compile-time costs, error message novels, ergonomic concurrency, and more.
+If you are reading this as a Boost contributor, know what you are part of. This library advances beyond Asio in the domains where they overlap. Not by abandoning what works, but by building on it. Not by chasing theoretical purity, but by solving the problems that have frustrated {cpp} developers for years: template explosion, compile-time costs, error message novels, ergonomic concurrency, and more.
 
 The coroutine era has arrived. And Boost, as it has so many times before, is leading the way.
diff --git a/doc/mrdocs.yml b/doc/mrdocs.yml
index 7a5365349..2560be736 100644
--- a/doc/mrdocs.yml
+++ b/doc/mrdocs.yml
@@ -26,4 +26,7 @@ multipage: true
 # use-system-libc: true
 # use-system-stdlib: true
 
+# Automation
+auto-function-metadata: false
+
 cmake: '-DCMAKE_CXX_STANDARD=20 -DBOOST_CAPY_MRDOCS_BUILD=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=OFF'
diff --git a/doc/quitter.md b/doc/quitter.md
new file mode 100644
index 000000000..4a1a1414f
--- /dev/null
+++ b/doc/quitter.md
@@ -0,0 +1,481 @@
+# `quitter<T>` - Stop-Aware Task Type
+
+## Summary
+
+`quitter<T>` is a task type that satisfies `IoRunnable` and provides
+transparent cancellation. When the stop token is triggered, the
+coroutine body never sees the cancellation. The promise intercepts
+it on resume and short-circuits to `final_suspend`. RAII cleanup
+runs normally. The parent sees a "stopped" completion. The coroutine
+author writes zero cancellation-handling code.
+
+## Motivation
+
+Ville Voutilainen (the emeritus chair of EWG in WG21) identified a gap in Capy's `task<T>`:
+when an application needs to shut down (e.g. Qt "close all windows"),
+every in-flight coroutine should exit cleanly without the programmer
+writing `if (ec) co_return;` after every `co_await`. The sender model
+achieves this via `set_stopped` - the coroutine never resumes. Capy's
+`task<T>` resumes with `operation_aborted` and requires explicit
+handling.
+
+`quitter<T>` closes this gap. It is a policy-level task type: same
+IoAwaitable protocol, same io_env propagation, same frame allocator,
+same symmetric transfer - but with automatic stop-on-resume behavior.
+
+## Behavior
+
+### Normal operation (stop not requested)
+
+Identical to `task<T>`. The coroutine runs, co_awaits I/O operations,
+returns a value or throws. No observable difference.
+
+### When stop is requested
+
+1. The stop token in `io_env` is triggered (from a parent, a scope,
+   or application shutdown).
+2. The in-flight I/O operation observes the token and completes
+   (typically with `operation_aborted`).
+3. The I/O awaitable resumes the coroutine via symmetric transfer.
+4. **Before the coroutine body sees the result**, the promise's
+   `transform_awaiter::await_resume()` checks the stop token.
+5. If `env_->stop_token.stop_requested()` is true, the awaiter
+   throws a sentinel exception (`detail::stop_requested_exception`).
+6. `unhandled_exception()` catches it and sets a `stopped_` flag
+   instead of storing the exception.
+7. `final_suspend` returns the continuation via symmetric transfer.
+8. The parent's `await_resume()` observes the stopped flag.
+
+### Why throw internally?
+
+Throwing inside the coroutine is the only portable mechanism to
+jump from an arbitrary suspension point to `final_suspend` while
+running destructors for all in-scope locals. The exception never
+escapes the coroutine - `unhandled_exception()` catches it. The
+user never sees it. No `exception_ptr` is stored. The cost is one
+throw + catch per cancellation, which happens once per coroutine
+lifetime (not per operation).
+
+Alternative: a compiler-level `coroutine_handle<>::destroy()` from
+outside would also run destructors, but the parent loses the
+completion signal. The internal-throw approach preserves structured
+completion.
+
+## Interface
+
+### Header
+
+```
+<boost/capy/quitter.hpp>
+```
+
+### Synopsis
+
+```cpp
+namespace boost::capy {
+
+template<typename T = void>
+struct [[nodiscard]] quitter
+{
+    struct promise_type;
+
+    // IoAwaitable
+    bool await_ready() const noexcept;
+    auto await_resume();
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> cont,
+        io_env const* env);
+
+    // IoRunnable
+    std::coroutine_handle<promise_type> handle() const noexcept;
+    void release() noexcept;
+
+    ~quitter();
+    quitter(quitter&&) noexcept;
+    quitter& operator=(quitter&&) noexcept;
+    quitter(quitter const&) = delete;
+    quitter& operator=(quitter const&) = delete;
+};
+
+} // namespace boost::capy
+```
+
+### `promise_type`
+
+```cpp
+struct quitter<T>::promise_type
+    : io_awaitable_promise_base<promise_type>
+    , detail::task_return_base<T>
+{
+    // Completion state
+    enum class completion { running, value, exception, stopped };
+
+    completion state_ = completion::running;
+    union { std::exception_ptr ep_; };
+
+    promise_type() noexcept;
+    ~promise_type();
+
+    quitter get_return_object();
+    auto initial_suspend() noexcept;   // same as task<T>
+    auto final_suspend() noexcept;     // same as task<T>
+
+    void unhandled_exception();
+    std::exception_ptr exception() const noexcept;
+    bool stopped() const noexcept;
+
+    template<class Awaitable>
+    auto transform_awaitable(Awaitable&& a);
+};
+```
+
+## Implementation Details
+
+### `detail::stop_requested_exception`
+
+```cpp
+namespace boost::capy::detail {
+
+struct stop_requested_exception {};
+
+} // namespace boost::capy::detail
+```
+
+A lightweight sentinel type. Not derived from `std::exception`.
+Never escapes the coroutine. Never stored in `exception_ptr`.
+
+### `promise_type::unhandled_exception()`
+
+```cpp
+void unhandled_exception()
+{
+    try {
+        throw;
+    }
+    catch (detail::stop_requested_exception const&) {
+        state_ = completion::stopped;
+    }
+    catch (...) {
+        new (&ep_) std::exception_ptr(std::current_exception());
+        state_ = completion::exception;
+    }
+}
+```
+
+### `promise_type::exception()` and `stopped()`
+
+```cpp
+std::exception_ptr exception() const noexcept
+{
+    if (state_ == completion::exception)
+        return ep_;
+    return {};
+}
+
+bool stopped() const noexcept
+{
+    return state_ == completion::stopped;
+}
+```
+
+### `transform_awaiter` (the key difference from `task<T>`)
+
+```cpp
+template<class Awaitable>
+struct transform_awaiter
+{
+    std::decay_t<Awaitable> a_;
+    promise_type* p_;
+
+    bool await_ready() noexcept
+    {
+        return a_.await_ready();
+    }
+
+    template<class Promise>
+    auto await_suspend(std::coroutine_handle<Promise> h) noexcept
+    {
+        using R = decltype(
+            a_.await_suspend(h, p_->environment()));
+        if constexpr (std::is_same_v<
+            R, std::coroutine_handle<>>)
+            return detail::symmetric_transfer(
+                a_.await_suspend(h, p_->environment()));
+        else
+            return a_.await_suspend(
+                h, p_->environment());
+    }
+
+    decltype(auto) await_resume()
+    {
+        set_current_frame_allocator(
+            p_->environment()->frame_allocator);
+
+        // THE KEY DIFFERENCE: check stop token before
+        // returning to the coroutine body
+        if (p_->environment()->stop_token.stop_requested())
+            throw detail::stop_requested_exception{};
+
+        return a_.await_resume();
+    }
+};
+```
+
+### `quitter::await_resume()`
+
+```cpp
+auto await_resume()
+{
+    if (h_.promise().stopped())
+        throw detail::stop_requested_exception{};
+    if (h_.promise().state_ == promise_type::completion::exception)
+        std::rethrow_exception(h_.promise().ep_);
+    if constexpr (!std::is_void_v<T>)
+        return std::move(*h_.promise().result_);
+}
+```
+
+When a `quitter` is awaited inside another `quitter`, the stopped
+state propagates upward: the parent's `await_resume` throws
+`stop_requested_exception`, which the parent's `unhandled_exception`
+catches and sets `stopped_`. The chain unwinds automatically.
+
+When a `quitter` is awaited inside a regular `task<T>`, the
+`stop_requested_exception` becomes an unhandled exception in the
+`task`. The `task`'s `await_resume` rethrows it. The caller must
+handle it. This is by design: `task<T>` does not opt into automatic
+cancellation.
+
+### `quitter::await_suspend()`
+
+Identical to `task<T>::await_suspend()`:
+
+```cpp
+std::coroutine_handle<> await_suspend(
+    std::coroutine_handle<> cont,
+    io_env const* env)
+{
+    h_.promise().set_continuation(cont);
+    h_.promise().set_environment(env);
+    return h_;
+}
+```
+
+### Everything else
+
+`initial_suspend`, `final_suspend`, `get_return_object`,
+`return_value`/`return_void`, frame allocation (`operator
+new`/`delete`), `handle()`, `release()`, move operations,
+destructor - all identical to `task<T>`.
+
+## Concept Satisfaction
+
+`quitter<T>` satisfies:
+
+- `IoAwaitable` - has `await_suspend(coroutine_handle<>, io_env const*)`
+- `IoRunnable` - has `promise_type`, `handle()`, `release()`,
+  `exception()`, `set_continuation()`, `set_environment()`,
+  `result()` (non-void)
+
+`quitter<T>` is usable with `run`, `run_async`, `when_all`,
+`when_any`, and any launcher that accepts `IoRunnable`.
+
+## Interaction with `when_all` / `when_any`
+
+`when_all` and `when_any` create a child `stop_source` and register
+a `stop_callback` on the parent token. When the parent requests
+stop, the child token is triggered. Each child `quitter` observes
+the token on its next resume and short-circuits. All children exit
+cleanly. The parent combinator sees the stopped completions and
+propagates accordingly.
+
+No changes to `when_all` or `when_any` are needed. The stop
+propagation already works through `io_env`. The only new behavior
+is inside `quitter`'s `transform_awaiter::await_resume()`.
+
+## Usage
+
+### Basic
+
+```cpp
+quitter<> session(tcp_socket& sock)
+{
+    char buf[1024];
+    auto [ec, n] = co_await sock.read_some(buf);
+    // If stop was requested, we never reach here.
+    // RAII cleanup for buf (trivial) and sock (if owned)
+    // runs automatically.
+    if (ec)
+        co_return;
+    co_await process(buf, n);
+}
+```
+
+### Application shutdown
+
+```cpp
+quitter<> run_server(tcp_acceptor& acc)
+{
+    while (true)
+    {
+        auto [ec, sock] = co_await acc.accept();
+        if (ec)
+            co_return;
+        co_await scope.spawn(session(sock));
+    }
+    // When the application calls stop_source.request_stop(),
+    // the next co_await in this coroutine (or any child)
+    // short-circuits. All sessions exit. All sockets close
+    // via RAII. The server exits cleanly.
+}
+```
+
+### Mixing with `task<T>`
+
+```cpp
+task<> outer()
+{
+    // If the quitter is stopped, its await_resume throws
+    // stop_requested_exception. The task sees it as an
+    // unhandled exception. The task's caller must handle it.
+    co_await some_quitter_function();
+}
+```
+
+This is intentional: `task<T>` does not opt into automatic
+cancellation. The programmer who uses `task<T>` has chosen
+explicit error handling.
+
+## Test Plan
+
+### Unit tests (in `test/unit/quitter.cpp`)
+
+1. **Normal completion** - `quitter<int>` returns a value.
+   Verify `await_resume()` returns the value. Verify
+   `stopped()` is false.
+
+2. **Void completion** - `quitter<>` completes normally.
+
+3. **Exception propagation** - `quitter<>` throws. Verify
+   `exception()` is non-null. Verify `stopped()` is false.
+
+4. **Stop before first co_await** - Request stop before the
+   quitter starts its first I/O operation. Verify the first
+   `co_await` short-circuits. Verify `stopped()` is true.
+   Verify RAII destructors ran (use a counting guard).
+
+5. **Stop during I/O** - Start an I/O operation, request stop
+   while it is in flight. Verify the coroutine does not resume
+   into the body. Verify `stopped()` is true. Verify RAII
+   destructors ran.
+
+6. **Stop propagation through chain** - `quitter` awaits
+   `quitter` awaits `quitter`. Request stop. Verify all three
+   report `stopped()`. Verify all RAII destructors ran.
+
+7. **Stop propagation with when_all** - `when_all` of two
+   `quitter` tasks. Request stop. Verify both stop. Verify
+   the `when_all` completes (does not hang).
+
+8. **Stop propagation with when_any** - `when_any` of two
+   `quitter` tasks. One completes normally. Verify the other
+   is stopped.
+
+9. **Mixing quitter and task** - `task<>` awaits `quitter<>`.
+   Request stop. Verify the `quitter` stops. Verify the
+   `task` sees `stop_requested_exception` as an unhandled
+   exception.
+
+10. **No stop requested** - Run a `quitter` to completion
+    without ever requesting stop. Verify identical behavior
+    to `task<T>`.
+
+11. **RAII verification** - Use a struct with a destructor
+    that increments a counter. Declare it in the coroutine
+    body before a `co_await`. Request stop. Verify the
+    counter was incremented (destructor ran).
+
+12. **Multiple co_await** - `quitter` does three sequential
+    I/O operations. Request stop after the second completes.
+    Verify the third `co_await` short-circuits. Verify the
+    first two results were processed normally.
+
+### Integration tests
+
+13. **TCP echo with shutdown** - Start a TCP echo server
+    using `quitter`. Connect a client. Exchange data. Request
+    stop. Verify the server exits cleanly and the socket is
+    closed.
+
+14. **Timer cancellation** - `quitter` awaits a long timer.
+    Request stop. Verify the timer is cancelled and the
+    coroutine exits promptly.
+
+## Files to Create
+
+| File | Contents |
+|------|----------|
+| `include/boost/capy/quitter.hpp` | `quitter<T>` class template |
+| `include/boost/capy/detail/stop_requested_exception.hpp` | Sentinel exception type |
+| `test/unit/quitter.cpp` | Unit tests |
+
+## Files to Modify
+
+| File | Change |
+|------|--------|
+| `include/boost/capy.hpp` | Add `#include <boost/capy/quitter.hpp>` |
+| `test/unit/CMakeLists.txt` | Add `quitter.cpp` to test sources |
+
+## Design Decisions
+
+**Why a separate type instead of a flag on `task<T>`?**
+
+The task type is a policy point. Users who want explicit error
+handling use `task<T>`. Users who want transparent cancellation
+use `quitter<T>`. The IoAwaitable protocol does not mandate a
+task. This is the design P4003 enables.
+
+**Why throw internally instead of `goto final_suspend`?**
+
+C++ coroutines have no mechanism to jump from an arbitrary
+suspension point to `final_suspend` other than throwing. The
+throw runs destructors for all in-scope locals. The exception
+is caught in `unhandled_exception()` and never escapes. The
+cost is one throw per cancellation per coroutine lifetime.
+
+**Why check in `await_resume` instead of `await_suspend`?**
+
+`await_suspend` has already committed to suspending. The I/O
+operation is in flight. Checking the stop token in `await_suspend`
+would race with the I/O completion. Checking in `await_resume`
+is safe: the operation has completed, the coroutine is about to
+resume, and we intercept before the body sees the result.
+
+**Why not check in `initial_suspend::await_resume`?**
+
+We could, and should. If the stop token is already triggered
+when the quitter starts, the first `initial_suspend::await_resume`
+should throw `stop_requested_exception` immediately. Add this
+check to `initial_suspend`'s awaiter:
+
+```cpp
+void await_resume() const noexcept(false)
+{
+    set_current_frame_allocator(
+        p_->environment()->frame_allocator);
+    if (p_->environment()->stop_token.stop_requested())
+        throw detail::stop_requested_exception{};
+}
+```
+
+Note: this changes `initial_suspend::await_resume` from `noexcept`
+to potentially-throwing. This is safe because `unhandled_exception()`
+will catch it.
+
+## Execution Protocol
+
+Save output after each complete semantic unit or to-do item (never
+mid-paragraph). Always save output BEFORE marking plan items done -
+never the reverse. On resumption: read the plan and last ~30 lines
+of the output file. Repair any truncated tail. Continue from where
+output ends, matching existing style. Never rewrite prior content.
diff --git a/doc/read-some-rationale.md b/doc/read-some-rationale.md
new file mode 100644
index 000000000..36399f800
--- /dev/null
+++ b/doc/read-some-rationale.md
@@ -0,0 +1,579 @@
+# Design Rationale: read_some Error Postconditions
+
+## Context
+
+This document captures the design space and trade-offs around
+`ReadStream::read_some` postconditions when an error occurs. The central
+question is whether the concept should guarantee `n == 0` on error, or
+permit `n >= 0` on error. A secondary question is the behavior when the
+caller passes a zero-length buffer. The analysis applies symmetrically
+to `WriteStream::write_some`.
+
+The consensus was reached through discussion between Peter Dimov and
+Andrzej Krzemieński, whose arguments shaped both the postcondition
+choice (E2) and the zero-length buffer semantics (Z3).
+
+## Current Consensus
+
+The current consensus adopts E2 (error permits `n >= 0`) and Z3
+(empty buffers are not an error). The `ReadStream` and `WriteStream`
+concepts are declared with the following contracts:
+
+```cpp
+/** Concept for types providing awaitable read operations.
+
+    A type satisfies ReadStream if it provides a read_some
+    member function template that accepts any MutableBufferSequence
+    and await-returns (error_code, std::size_t).
+
+    Semantic Requirements:
+
+    Attempts to read up to buffer_size( buffers ) bytes from
+    the stream into the buffer sequence.
+
+    If buffer_size( buffers ) > 0:
+
+    - If !ec, then n >= 1 && n <= buffer_size( buffers ). n bytes
+      were read into the buffer sequence.
+    - If ec, then n >= 0 && n <= buffer_size( buffers ). n is the
+      number of bytes read before the I/O condition arose.
+
+    If buffer_empty( buffers ) is true, n is 0. The empty buffer
+    is not itself a cause for error, but ec may reflect the state
+    of the stream.
+
+    Buffers in the sequence are filled in order.
+
+    Error Reporting:
+
+    I/O conditions arising from the underlying I/O system (EOF,
+    connection reset, broken pipe, etc.) are reported via the
+    error_code component of the return value. Failures in the
+    library wrapper itself (such as memory allocation failure)
+    are reported via exceptions.
+
+    Throws: std::bad_alloc if coroutine frame allocation fails.
+*/
+template< typename T >
+concept ReadStream =
+    requires( T& stream, mutable_buffer_archetype buffers )
+    {
+        { stream.read_some( buffers ) } -> IoAwaitable;
+        requires awaitable_decomposes_to<
+            decltype( stream.read_some( buffers ) ),
+            std::error_code, std::size_t >;
+    };
+```
+
+```cpp
+/** Concept for types providing awaitable write operations.
+
+    A type satisfies WriteStream if it provides a write_some
+    member function template that accepts any ConstBufferSequence
+    and await-returns (error_code, std::size_t).
+
+    Semantic Requirements:
+
+    Attempts to write up to buffer_size( buffers ) bytes from
+    the buffer sequence to the stream.
+
+    If buffer_size( buffers ) > 0:
+
+    - If !ec, then n >= 1 && n <= buffer_size( buffers ). n bytes
+      were written from the buffer sequence.
+    - If ec, then n >= 0 && n <= buffer_size( buffers ). n is the
+      number of bytes written before the I/O condition arose.
+
+    If buffer_empty( buffers ) is true, n is 0. The empty buffer
+    is not itself a cause for error, but ec may reflect the state
+    of the stream.
+
+    Buffers in the sequence are consumed in order.
+
+    Error Reporting:
+
+    I/O conditions arising from the underlying I/O system (EOF,
+    connection reset, broken pipe, etc.) are reported via the
+    error_code component of the return value. Failures in the
+    library wrapper itself (such as memory allocation failure)
+    are reported via exceptions.
+
+    Throws: std::bad_alloc if coroutine frame allocation fails.
+*/
+template< typename T >
+concept WriteStream =
+    requires( T& stream, const_buffer_archetype buffers )
+    {
+        { stream.write_some( buffers ) } -> IoAwaitable;
+        requires awaitable_decomposes_to<
+            decltype( stream.write_some( buffers ) ),
+            std::error_code, std::size_t >;
+    };
+```
+
+E2 is also chosen for consistency: composed operations like `read`
+return partial progress alongside errors by necessity (there is no
+other way to report how many bytes were transferred before EOF). If
+`read_some` adopted E1, callers would need one loop style for
+`read_some` and a different one for `read`. Under E2, the same
+advance-then-check pattern is correct everywhere.
+
+The rationale for these choices follows.
+
+## Background
+
+### The read_some Contract
+
+`read_some` accepts a buffer sequence and returns `(error_code, size_t)`.
+When `buffer_size(buffers) > 0`, the non-error case is uncontroversial:
+
+- **No error:** `!ec`, and `n >= 1` (at least one byte transferred).
+
+The disputes concern the error case and the empty-buffer case.
+
+### The Canonical I/O Loop
+
+Every composed read algorithm that accumulates progress follows the same
+pattern:
+
+```cpp
+auto [ec, n] = co_await s.read_some(
+    mutable_buffer( buf + total, size - total ) );
+total += n;
+if( ec )
+    co_return;
+```
+
+The advance-then-check ordering is the only correct pattern for I/O loops.
+It is required for any function that can report partial progress alongside
+an error - `read` returning `(eof, 47)` being the canonical example. If
+the check precedes the advance, the 47 bytes are silently dropped.
+
+This ordering is correct under both postconditions. Under the strict rule
+(`n == 0` on error), the advance is a harmless no-op. Under the relaxed
+rule (`n >= 0` on error), the advance captures partial progress. The
+caller writes identical code either way.
+
+### Concepts vs. Concrete Types
+
+A concept specifies the least set of requirements that generic code may
+rely on. Concrete types that model the concept may offer stronger
+guarantees as refinements. The concept does not prevent a concrete type
+from being more specific; it only determines what generic algorithms
+can assume.
+
+### The memcpy Precedent
+
+C++ historically gave `memcpy` a precondition that the source and
+destination pointers must be non-null, even when the byte count is zero.
+This forced every call site to guard with `if( n > 0 )`, despite the
+obvious meaning of copying zero bytes from a null range. C++23 corrected
+this by allowing null pointers when the count is zero.
+
+The zero-length buffer question in `read_some` is structurally identical.
+`(nullptr, 0)` is an established representation of an empty range, and
+requiring callers to guard against it creates friction with no
+corresponding benefit.
+
+## The Zero-Length Buffer Question
+
+Three options exist for `read_some` when `buffer_empty(buffers)` is true:
+
+### Option Z1: Precondition (buffer_size > 0)
+
+Passing a zero-length buffer is undefined behavior.
+
+**Arguments for:**
+
+1. Avoids inconsistencies between platforms in the zero-length case.
+2. Simplifies the specification by removing a corner case.
+
+**Arguments against:**
+
+1. Crashes are almost never desirable in a server. Unnecessary
+   preconditions increase risk in environments with adversarial inputs
+   and concurrency.
+2. Empty ranges are legitimate. Protocols with length-prefixed payloads
+   naturally produce zero-length payloads. Framing code that calls
+   `write(payload)` after writing the length header should not need a
+   guard for the empty case.
+3. Repeats the `memcpy` mistake that C++23 corrected.
+4. Forces every caller that might receive a zero-length buffer to wrap
+   the call with `if( buffer_size(buffers) > 0 )`. The wrapping is
+   pure friction.
+5. Generic code constrained by `ReadStream` cannot distinguish "the
+   buffer happened to be empty" from a bug. Crashing on a legitimate
+   edge case is hostile to generic programming.
+
+### Option Z2: Error (return EINVAL)
+
+Return `(EINVAL, 0)` for a zero-length buffer.
+
+**Arguments for:**
+
+1. No crash. The caller receives a well-defined result.
+2. Signals that the call was unusual, allowing the caller to distinguish
+   the empty case from a real I/O error if desired.
+
+**Arguments against:**
+
+1. A zero-length buffer is not invalid input. It is the degenerate case
+   of a legitimate operation, just as copying zero bytes is the
+   degenerate case of `memcpy`.
+2. Returning an error for a non-error condition forces every call site
+   that might produce empty buffers to either pre-check or post-filter
+   `EINVAL` out of the error path.
+3. For `write`, the analysis is clear: `write` returns either
+   `(0, n)` on success or `(ec, m)` with `m < n` on failure.
+   Substituting `n = 0`, only the success case `(0, 0)` satisfies
+   both conditions. The error case cannot arise.
+
+### Option Z3: Not an Error (n == 0, ec unspecified)
+
+The empty buffer is not itself an error condition: no EINVAL, no
+precondition violation. `n` is 0. Whether `ec` is set depends on the
+implementation - the operation may report errors arising from the state
+of the stream (e.g., a closed fd or broken pipe). Whether a system call
+occurs is unspecified.
+
+**Arguments for:**
+
+1. No guard needed at call sites. Generic code that might produce empty
+   buffers works without special-casing.
+2. Matches the corrected `memcpy` semantics in C++23: empty input is
+   permitted, not an error in itself.
+3. The concept leaves both system call behavior and `ec` unspecified,
+   which is the weakest useful postcondition. Concrete types that
+   short-circuit empty buffers with `(!ec, 0)` conform. Concrete types
+   that forward the zero-length call to the OS and report whatever
+   error arises also conform.
+4. Permits zero-length operations to serve as probes (fd validation,
+   broken pipe detection) without the concept forbidding the resulting
+   error.
+
+**Arguments against:**
+
+1. Callers cannot rely on `(!ec, 0)` for empty buffers. A caller that
+   passes an empty buffer expecting guaranteed success must handle the
+   possibility of an error from the underlying I/O state. In practice
+   this is not a burden because correct I/O code already checks `ec`
+   unconditionally.
+
+**Recommendation:** Option Z3. The concept should specify the weakest
+useful postcondition. The only guarantee is `n == 0`; everything else
+is left to the implementation.
+
+## The Error Postcondition Question
+
+When `buffer_size(buffers) > 0` and the operation fails, two
+postconditions are under consideration:
+
+### Option E1: Error Implies n == 0
+
+On error, `ec` is set and `n` is guaranteed to be 0.
+
+**Arguments for:**
+
+1. **Clean partition of the return space.** The return values form a
+   non-overlapping table:
+
+   | Condition    | ec            | n      |
+   | ------------ | ------------- | ------ |
+   | Empty buffer | unspecified   | `0`    |
+   | Success      | `!ec`         | `>= 1` |
+   | Error        | `ec`          | `0`    |
+
+   `n > 0` alone tells you the operation succeeded. When `n == 0` and
+   `ec` is set, the operation failed. When `n == 0` and `!ec`, the
+   buffer was empty.
+
+2. **`total` is unambiguous in generic code.** After the canonical loop,
+   `total` represents bytes that were transferred without any error
+   during the transfer that produced them. Under the relaxed rule,
+   `total` might include bytes from a transfer that also reported an
+   error, and only type-specific knowledge can determine whether those
+   bytes are trustworthy.
+
+3. **Concrete types can refine upward.** If `tcp_socket::read_some`
+   wants to expose POSIX short-read-on-error semantics, it can document
+   `n >= 0` on error as a refinement. The concept does not prevent
+   concrete types from having richer postconditions; it just means
+   generic code does not rely on them.
+
+4. **Matches POSIX semantics.** POSIX `read(2)` and `write(2)` return a
+   single `ssize_t` - either a byte count or -1 with `errno`. They
+   never report both simultaneously. When a partial transfer occurs
+   before an error, POSIX returns the byte count on the current call and
+   the error on the next. A `read_some` wrapper around POSIX naturally
+   produces `(ec, 0)` on error with no special effort. E1 is the direct
+   translation of the POSIX model into `(error_code, size_t)`.
+
+**Arguments against:**
+
+1. **Forces non-POSIX streams to emulate POSIX's deferred-error model.**
+   POSIX splits partial-transfer-then-error into two calls because its
+   API returns a single value. The `(error_code, size_t)` return type
+   was designed to transcend this limitation. Layered streams (TLS,
+   compression, protocol framers) may encounter an error after a partial
+   transfer as a single atomic event. Under E1, they must artificially
+   split this into two calls - reporting success now and deferring the
+   error - reproducing a POSIX API limitation that the richer return
+   type was meant to eliminate.
+
+2. **Inverts the meaning of "least common denominator."** A concept
+   should impose the weakest postcondition that all conforming types can
+   satisfy. Every stream can satisfy `n >= 0` on error. Not every stream
+   can satisfy `n == 0` on error without extra work. The strict rule is a
+   *stronger* requirement on implementers, which is the opposite of what
+   a least-common-denominator concept should impose.
+
+3. **Loses information.** If the implementation transferred bytes into
+   the buffer before encountering an error, reporting `(ec, 0)` instead
+   of `(ec, k)` means the caller cannot know the buffer was partially
+   written. For layered streams (TLS, compression, protocol framing),
+   this lost information can matter for resynchronization or error
+   recovery.
+
+4. **Forces implementations to lie or buffer.** A TLS stream might
+   decrypt 100 bytes into user space, then receive a fatal alert on the
+   next record. Under the strict rule it must either report `(!ec, 100)`
+   now and `(ec, 0)` on the next call (requiring deferred-error
+   bookkeeping), or report `(ec, 0)` and discard 100 valid bytes.
+   Neither is clean.
+
+### Option E2: Error Permits n >= 0
+
+On error, `ec` is set and `n` may be any value from 0 to the buffer
+size.
+
+**Arguments for:**
+
+1. **Transcends the POSIX API limitation.** POSIX `read(2)` cannot
+   report a byte count and an error simultaneously because it returns a
+   single `ssize_t`. The `(error_code, size_t)` return type can. E2
+   allows implementations to use this capability, reporting partial
+   transfers alongside errors as a single result rather than
+   artificially splitting them across two calls.
+
+2. **Strictly weaker postcondition.** Any stream that can satisfy E1
+   trivially satisfies E2. The reverse is not true. E2 admits a strictly
+   larger set of conforming types.
+
+3. **Preserves maximum information.** The caller receives the byte count
+   regardless of error state. Callers who do not care can ignore `n` on
+   error (the advance is a no-op when `n == 0`). Callers who do care
+   get the data they need.
+
+4. **Implementer freedom.** The concept does not dictate how the
+   implementation manages partial I/O internally. If TLS decrypted 100
+   bytes before the alert, it can report `(ec, 100)` honestly. No
+   deferred-error bookkeeping, no discarded data.
+
+5. **Zero additional call-site cost.** The canonical I/O loop advances
+   before checking the error:
+
+   ```cpp
+   total += n;
+   if( ec )
+       co_return;
+   ```
+
+   This pattern is already mandatory for every function that can report
+   partial progress alongside an error (EOF with partial data being the
+   canonical case). The relaxed postcondition does not change the code
+   callers must write - it only changes whether the advance is a no-op
+   or captures real progress.
+
+**Arguments against:**
+
+1. **Ambiguity of partial bytes.** When `read_some` returns
+   `(connection_reset, 42)`, are those 42 bytes valid and complete? The
+   concept cannot answer this because the answer depends on the concrete
+   stream. Generic code that accumulated them into `total` may be
+   reporting a byte count that includes bytes from a failed transfer.
+
+2. **Return space is no longer cleanly partitioned.** The table becomes:
+
+   | Condition    | ec            | n      |
+   | ------------ | ------------- | ------ |
+   | Empty buffer | unspecified   | `0`    |
+   | Success      | `!ec`         | `>= 1` |
+   | Error        | `ec`          | `>= 0` |
+
+   The error row overlaps with success on `n > 0`. `n > 0` is no longer
+   a proxy for success; `ec` must always be checked independently.
+
+3. **Concrete types can refine upward.** This argument applies
+   symmetrically: the concept can say `n == 0` on error without
+   preventing concrete types from documenting `n >= 0`. The concept
+   constrains generic code, not concrete implementations.
+
+## Analysis
+
+### The Call-Site Argument Is Phantom
+
+The strongest intuitive argument for E1 is that it simplifies call sites.
+This does not survive scrutiny. The canonical I/O loop must advance before
+checking the error regardless of which postcondition the concept chooses,
+because the advance-then-check pattern is required for correctness with
+any function that reports partial progress (such as `read` reporting
+bytes transferred before EOF). The code is identical under both rules:
+
+```cpp
+auto [ec, n] = co_await s.read_some( buf );
+total += n;
+if( ec )
+    co_return;
+```
+
+Under E1, the advance on error is a no-op (`n == 0`). Under E2, the
+advance may capture real progress. In neither case does the caller write
+different code. The perceived simplification of E1 exists only if the
+caller writes the check-then-advance anti-pattern, which is already
+incorrect for other reasons.
+
+### The Concept's Job
+
+A concept specifies what generic code may rely on. The question is: does
+generic code benefit from knowing `n == 0` on error?
+
+The primary consumer is the accumulation loop. As shown above, it writes
+identical code either way. The secondary consumer is code that inspects
+`n > 0` as a proxy for success. Under E1, this proxy works. Under E2,
+the caller must check `ec` independently. But checking `ec` is what
+correct I/O code does regardless - using `n > 0` as a success proxy is
+a shortcut that masks the error status, not a pattern the concept should
+encourage.
+
+### The Implementer's Burden
+
+Under E1, every stream that might encounter an error after partial
+transfer must choose between:
+
+- **Deferred errors.** Report `(!ec, k)` now, remember the error, and
+  report `(ec, 0)` on the next call. This requires per-stream state and
+  makes the stream's behavior depend on call history.
+- **Data loss.** Report `(ec, 0)` and discard the `k` bytes that were
+  transferred. The caller's buffer contains data the stream will not
+  acknowledge.
+- **Internal buffering.** Copy the `k` bytes into an internal buffer
+  and replay them on the next call. This adds allocation and copying
+  overhead for a postcondition the caller does not need.
+
+Under E2, the implementation reports what happened: `(ec, k)`. No
+deferred state, no data loss, no internal buffering.
+
+### Symmetry with write_some
+
+The analysis applies symmetrically to `write_some`. Consider a stream
+that successfully sends 500 bytes of a 1000-byte buffer before
+encountering a connection reset. Under E1, it must report `(ec, 0)` and
+the caller cannot know that 500 bytes reached the peer. Under E2, it
+reports `(ec, 500)` and the caller can account for the partial transfer
+in retry logic or error reporting.
+
+### The Partition Argument
+
+E1 produces a cleaner partition of the return space, where `n > 0`
+alone distinguishes success from error. This is a real property, but its
+practical value is limited: correct I/O code always checks `ec`. The
+partition is an aesthetic property of the specification, not a property
+that changes what correct call sites look like.
+
+### The Composed Operation Argument
+
+The strict postcondition on `write_some` does not propagate to composed
+operations. `write` (which transfers the full buffer) returns `(ec, m)`
+where `m < n` on failure - it must report partial progress alongside
+the error because there is no other way to tell the caller how many
+bytes were sent. The `(ec, n>0)` case that E1 eliminates from
+`write_some` is immediately reintroduced one layer up.
+
+This observation has a structural consequence: E1 reproduces the POSIX
+deferred-error model at the `_some` layer, then undoes it at the
+composed layer. POSIX defers errors across calls because its API returns
+a single `ssize_t`. The `(error_code, size_t)` return type was designed
+to carry both values simultaneously. E1 artificially constrains the
+primitive to not use this capability, then the first composed operation
+built on top of it must reintroduce partial-progress-with-error because
+the composition has no other way to report what happened. The strict
+guarantee is local to the primitive and dissolves upon composition.
+
+## Areas of Agreement
+
+1. **The advance-then-check pattern is mandatory.** Regardless of which
+   postcondition the concept chooses, the canonical loop advances
+   `total` before checking `ec`. This is a requirement of correct I/O
+   programming, not a consequence of the postcondition choice.
+
+2. **Zero-length buffers should not crash.** A precondition requiring
+   non-empty buffers repeats the `memcpy` mistake. Empty ranges are
+   legitimate in protocols with variable-length payloads.
+
+3. **Zero-length buffers are not an error.** The empty buffer itself
+   should not cause a dedicated error. `n` is 0. Whether `ec` is set
+   depends on the I/O state of the stream, not on the buffer size.
+
+4. **The concept constrains generic code.** Concrete types may refine
+   the concept's postconditions with stronger guarantees. The concept
+   specifies the floor, not the ceiling.
+
+5. **EOF is an error condition with partial data.** `read` returning
+   `(eof, 47)` is the canonical example of partial progress with an
+   error. Every I/O loop must handle this correctly, and the
+   advance-then-check pattern exists precisely for this reason.
+
+## Areas of Disagreement
+
+1. **Whether the concept should minimize implementer burden or maximize
+   return-value clarity.** E1 produces a cleaner partition of the return
+   space. E2 avoids deferred-error bookkeeping, data loss, and internal
+   buffering in implementations. The disagreement is over which cost is
+   more important.
+
+2. **Whether `n > 0` should be a success proxy.** Under E1, `n > 0`
+   implies `!ec`. Under E2, it does not. One view holds that this proxy
+   is a useful property for reasoning about return values. The other
+   holds that correct code always checks `ec`, and the proxy encourages
+   a shortcut that masks the error status.
+
+3. **Whether "least common denominator" favors weaker or stronger
+   postconditions.** One view holds that the concept should impose the
+   weakest requirements all streams can satisfy (E2). The other holds
+   that the concept should impose the strongest requirements that still
+   admit all *useful* streams, on the grounds that a concept exists to
+   enable generic programming, not to admit the widest set of types (E1).
+
+4. **Whether the meaning of `total` matters.** Under E1, `total`
+   after a loop represents bytes transferred without error. Under E2,
+   `total` represents bytes transferred, some of which may have been
+   accompanied by an error. One view holds that the former is more
+   meaningful. The other holds that both are equally useful, since the
+   bytes were transferred either way.
+
+## Summary
+
+| Property                         | E1 (n == 0 on error) | E2 (n >= 0 on error)  |
+| -------------------------------- | -------------------- | --------------------- |
+| Call-site code                   | Identical            | Identical             |
+| Return-value partition           | Clean                | Overlapping           |
+| Implementer burden               | Higher               | Lower                 |
+| Information preservation         | Lossy                | Lossless              |
+| POSIX alignment                  | Matches              | Transcends            |
+| Composed operations              | Reintroduce (ec,n>0) | Consistent throughout |
+| Concept philosophy               | Strongest useful     | Weakest possible      |
+| n > 0 implies success            | Yes                  | No                    |
+| Deferred-error bookkeeping       | Required for some    | Never required        |
+
+The core tension is between specification clarity (E1) and implementation
+honesty (E2). E1 matches POSIX, produces a clean return-value partition,
+and is the natural translation of the single-return-value POSIX model
+into `(error_code, size_t)`. E2 uses the richer return type to transcend
+the POSIX limitation, avoids forcing non-POSIX streams into a
+deferred-error model, and produces a postcondition that is consistent
+from `_some` primitives through composed operations. Since the canonical
+call-site code is identical under both rules, the choice reduces to
+whether the concept should reproduce the POSIX deferred-error model at
+the cost of implementation burden on layered streams, or whether it
+should permit the implementation to report exactly what happened at the
+cost of a less tidy specification.
diff --git a/doc/reference/concurrency-1.md b/doc/reference/concurrency-1.md
index dfaafa6c6..6a199bc69 100644
--- a/doc/reference/concurrency-1.md
+++ b/doc/reference/concurrency-1.md
@@ -12,7 +12,7 @@ This time, we'll be going through C++ multithreading and concurrency related stu
 
 - How **variables, loops, conditionals, etc**. work (Basic coding fundamentals will help a lot!)
 - Linux (**Terminal/Console proficiency**) (We're going to need to compile our stuff)
-- Gone through the all preceding parts of the tutorial
+- Gone through all preceding parts of the tutorial
 - Some familiarity with threading will help
 
 
@@ -52,7 +52,7 @@ This time, we'll be going through C++ multithreading and concurrency related stu
 
 [Image Source](<https://msl-network.readthedocs.io/en/latest/concurrency_async.html>)
 
-Everyone likes threading ja. Why not make such an efficient language such as C++ even more efficient with multi-threading.
+Everyone likes threading ja. Why not make an efficient language like C++ even more efficient with multi-threading.
 
 We're going to talk about the nice `std::thread` class that abstracts away the low level POSIX threads or p threads library in C. We'll also talk about `std::async` for asynchronous thread generation, as well as a bit on locks and atomic types.
 
@@ -188,7 +188,7 @@ std::thread ref_function_thread(ref_function, std::ref(val), 2);
 
 #### **A Note on Static Variables**
 
-Be wary of declaring static variables in a multiple threads though!
+Be wary of declaring static variables in multiple threads though!
 
 ```c++
 // Suppose this is your thread function
@@ -237,7 +237,7 @@ example_thread.join();
 
 ```c++
 // So you can check if a thread is joinable before calling the join method!
-if (exmaple_thread.joinable())
+if (example_thread.joinable())
 {
   example_thread.join(); 
 }
@@ -379,7 +379,7 @@ thread_function()
 
 It's actually better to just use a lock guard, which manages the lifecycle of a mutex for you.
 
-It's kind of like the `with:` operator in Python.
+It's kind of like the `with` statement in Python.
 
 **Notably, a lock guard releases the lock automatically once the function that it is called in goes out of scope!**
 
@@ -842,7 +842,7 @@ Or, more completely
 #include <thread>
 #include <future>
  
-void initiazer(std::promise<int> * promObj)
+void initializer(std::promise<int> * promObj)
 {
     std::cout<<"Inside Thread"<<std::endl;     promObj->set_value(35);
 }
@@ -851,7 +851,7 @@ int main()
 {
     std::promise<int> promiseObj;
     std::future<int> futureObj = promiseObj.get_future();
-    std::thread th(initiazer, &promiseObj);
+    std::thread th(initializer, &promiseObj);
     std::cout<<futureObj.get()<<std::endl;
     th.join();
     return 0;
@@ -936,1387 +936,3 @@ auto future = std::async(std::launch::async, [](){});
 ---
 
  [![Yeah! Buy the DRAGON a COFFEE!](../_assets/COFFEE%20BUTTON%20%E3%83%BE(%C2%B0%E2%88%87%C2%B0%5E).png)](https://www.buymeacoffee.com/methylDragon)
-
- 
-
-Author: methylDragon  
-Contains an advanced syntax reference for C++  
-This time, we'll be going through C++ multithreading and concurrency related stuff!    
-
-------
-
-## Pre-Requisites
-
-**Assumed knowledge (This is a C++ crash course, not a basic coding tutorial)**
-
-- How **variables, loops, conditionals, etc**. work (Basic coding fundamentals will help a lot!)
-- Linux (**Terminal/Console proficiency**) (We're going to need to compile our stuff)
-- Gone through the all preceding parts of the tutorial
-- Some familiarity with threading will help
-
-
-
-## Table Of Contents <a name="top"></a>
-
-1. [Introduction](#1)    
-2. [C++ Threading Reference](#2)    
-   2.1 [Threads](#2.1)    
-   2.2 [Creating Threads](#2.2)    
-   2.3 [Thread Specific Functions](#2.3)    
-   2.4 [Sharing Data](#2.4)    
-   2.5 [Waiting, Killing, and Detaching](#2.5)    
-   2.6 [Race Conditions](#2.6)    
-   2.7 [Atomics](#2.7)    
-   2.8 [Mutex and Locks](#2.8)    
-   2.9 [A Better Way: Lock Guards](#2.9)    
-   2.10 [Lock Guard Types](#2.10)    
-   2.11 [Exclusive Locks vs Shared Locks](#2.11)    
-   2.12 [Mutex Types](#2.12)    
-   2.13 [Event Handling: Condition Variables](#2.13)    
-3. [C++ Concurrency Reference](#3)    
-   3.1 [Introduction](#3.1)    
-   3.2 [When to Use Threads or Tasks](#3.2)    
-   3.3 [Promises and Futures](#3.3)    
-   3.4 [A Simple Promise-Future Example](#3.4)    
-   3.5 [Async](#3.5)    
-   3.6 [Async Launch Policies](#3.6)    
-   3.7 [Different Ways to Call Async](#3.7)    
-
-
-
-
-## 1. Introduction <a name="1"></a>
-
-![_images/concurrency_vs_parallelism.png](assets/concurrency_vs_parallelism-1562918749730.png)
-
-[Image Source](<https://msl-network.readthedocs.io/en/latest/concurrency_async.html>)
-
-Everyone likes threading ja. Why not make such an efficient language such as C++ even more efficient with multi-threading.
-
-We're going to talk about the nice `std::thread` class that abstracts away the low level POSIX threads or p threads library in C. We'll also talk about `std::async` for asynchronous thread generation, as well as a bit on locks and atomic types.
-
-
-
-## 2. C++ Threading Reference <a name="2"></a>
-
-### 2.1 Threads <a name="2.1"></a>
-[go to top](#top)
-
-
-![img](assets/threads-as-control-flow.png)
-
-[Image Source](<https://kholdstare.github.io/technical/2012/08/21/objects-and-threads-in-cpp-1.html>)
-
-You can use the [std::thread](<http://www.cplusplus.com/reference/thread/thread/>) class to start threads. Each instance of this thread represents and wraps and manages a single execution thread.
-
-![_images/concurrency_vs_parallelism.png](assets/concurrency_vs_parallelism-1562918749730.png)
-
-[Image Source](<https://msl-network.readthedocs.io/en/latest/concurrency_async.html>)
-
-Threads will run **concurrently** if they're on the same processor. But ***in parallel*** if they're on different processors!
-
-Each thread has its own call stack, but **all threads share the heap.**
-
-You can find the maximum number of active threads that you can start. If your number of active threads exceeds this number you won't really get more performance out of it, so take note!
-
-```c++
-#include <thread>
-
-unsigned int c = std::thread::hardware_concurrency();
-```
-
-
-
-### 2.2 Creating Threads <a name="2.2"></a>
-[go to top](#top)
-
-
-There are several ways to create a thread:
-
-- Using a **function pointer**
-- Using a **lambda function**
-- Using a **functor**
-
-**Function Pointer**
-
-```c++
-#include <thread>
-
-// Define a function and start a thread that runs that function
-void rawr(params) {}
-std::thread rawr_thread(rawr, params);
-```
-**Lambda Function**
-```c++
-// Define a lambda expression and start a thread that runs that lambda expression
-auto rar = [](params) {};
-std::thread rar_thread(rar, params);
-
-// Or pass the lambda directly!
-std::thread rar_thread([](params) {};, params);
-```
-**Functor**
-```c++
-// Define a functor and start a thread that runs the functor's function call
-class raa_object_class {
-  void operator()(params) {}
-}
-
-std::thread raa_thread(raa_class_object(), params);
-```
-
-> Don't create threads on the heap with the new operator! Do it automatically on the stack for efficiency like in the examples stated above.
-
-
-
-### 2.3 Thread Specific Functions <a name="2.3"></a>
-[go to top](#top)
-
-
-Use `std::this_thread` within threads to refer to the current thread!
-
-**Note that yield() is NOT like the Python yield! It's completely different behaviour.**
-
-```c++
-#include <thread>
-#include <chrono>
-
-// These can be used within a thread
-
-// Get thread ID of thread
-std::this_thread::get_id();
-
-// Give priority to other threads, pause execution
-std::this_thread::yield();
-
-// Sleep for some amount of time
-std::this_thread::sleep_for(std::chrono::seconds(1));
-
-// Sleep until some time
-std::chrono::system_clock::time_point time_point = std::chrono::system_clock::now()
-                                                   + std::chrono::seconds(10);
-std::this_thread::sleep_until(time_point);
-```
-
-
-
-### 2.4 Sharing Data <a name="2.4"></a>
-[go to top](#top)
-
-
-**Global Variables**
-
-All global and static variables that are initialised at compile time can be accessed by threads. Since the threads should know the addresses for them.
-
-#### **Passing By Reference**
-
-All parameters passed to a function when starting a thread are **passed by value**, even if you defined in the function to pass by reference!
-
-You need to **explicitly wrap the arguments in std::ref() to pass by reference.**
-
-Example:
-
-```c++
-void ref_function(int &a, int b) {}
-
-int val;
-std::thread ref_function_thread(ref_function, std::ref(val), 2);
-```
-
-**Because the thread functions can't return anything, passing by reference is the only way to properly get data out of a thread without using global variables.** Ensure that your thread modifies the data passed in by reference and you should be good to go.
-
-#### **A Note on Static Variables**
-
-Be wary of declaring static variables in a multiple threads though!
-
-```c++
-// Suppose this is your thread function
-void method()
-{
-  static int var = 0;
-  var++;
-}
-```
-
-**Note that this does NOT create a separate instance of the static variable per thread instance.** This is because static variables are initialised once when the compiler goes over their declaration.
-
-If you want to have 'static' variables that are static within the scope of each particular thread, use `thread_local` variables instead. Then each thread will have its own version of the static variable, and the static variable will only be destroyed on thread exit.
-
-```c++
-void method()
-{
-  thread_local int var = 0;
-  var++;
-}
-```
-
-
-
-### 2.5 Waiting, Killing, and Detaching <a name="2.5"></a>
-[go to top](#top)
-
-
-#### **Waiting to Complete**
-
-You use the `join()` method to wait for a thread to complete.
-
-Calling `join()` will **block the main thread** until the thread that is being waited for completes.
-
-```c++
-// Start thread example_thread
-std::thread example_thread(some_function); 
-
-// Block and wait for thread to finish
-example_thread.join();
-
-// Ok! We're done and good to go on doing other stuff ...
-```
-
-**You cannot join a thread if it is not joinable** (maybe you killed it already, or it was detached.)
-
-```c++
-// So you can check if a thread is joinable before calling the join method!
-if (exmaple_thread.joinable())
-{
-  example_thread.join(); 
-}
-```
-
-#### **Kill a Thread**
-
-Use `return`, **not** `std::terminate()`! `terminate()` will kill your entire program process, not an individual thread. 
-
-```c++
-return;
-```
-
-#### **Detaching a Thread**
-
-You may `detach` a thread. That is, split it from the `std::thread()` object that manages it. Once you do that, you won't be able to manage the thread aside from any mutex or shared resources between the different threads.
-
-Those detached threads will only exit when the main process is terminated or when the top level function exits.
-
-```c++
-example_thread.detach();
-```
-
-
-
-### 2.6 Race Conditions <a name="2.6"></a>
-[go to top](#top)
-
-
-![SharedMutable](assets/SharedMutable.png)
-
-[Image Source](<https://www.modernescpp.com/index.php/c-core-guidelines-rules-for-concurrency-and-parallelism>)
-
-It's always thread-safe if you're only reading variables from multiple threads. But the moment you start writing data from multiple threads, you can potentially crash or create unexpected behaviour.
-
-**Example**
-
-```c++
-// Source: https://stackoverflow.com/questions/34510/what-is-a-race-condition
-
-if (x == 5) // The "Check"
-{
-   y = x * 2; // The "Act"
-
-   // If another thread changed x in between "if (x == 5)" and "y = x * 2" above,
-   // y will not be equal to 10.
-}
-```
-
-
-
-### 2.7 Atomics <a name="2.7"></a>
-[go to top](#top)
-
-
-So there are several ways to prevent race conditions. An `std::atomic` is just one way.
-
-An atomic type is mainly a type that implements atomic operations. That is, operations that are thread safe and run independently of any other processes. There can be some overhead, especially when there is a lot of contention around them, but it's hard to get into details for how much overhead exactly, since it's platform and context specific.
-
-Using an atomic type **guarantees no race conditions will occur.** 
-
-> **Use atomic types only when you need them, and native types when you don't. If you care about performance, that is.**
-
-You can check the [Atomic Types Reference](<https://en.cppreference.com/w/cpp/atomic/atomic>) for the full list of how to instantiate them, but here's a couple of examples.
-
-**There's a gigantic list! This table is non-exhaustive:**
-
-|    Type Alias    | Type Instantiation  |
-| :--------------: | :-----------------: |
-| std::atomic_bool | `std::atomic<bool>` |
-| std::atomic_char | `std::atomic<char>` |
-| std::atomic_int  | `std::atomic<int>`  |
-| std::atomic_long | `std::atomic<long>` |
-|        .         |          .          |
-|        .         |          .          |
-|        .         |          .          |
-
-
-
-### 2.8 Mutex and Locks <a name="2.8"></a>
-[go to top](#top)
-
-
-#### **Introduction**
-
-We'll go through this for completeness' sake, but there is a better way to do things (lock guards.)
-
-**Mutexes** are mutual exclusion objects that are used for thread synchronisation. They're a way to keep track of whether a particular thread is using a resource, and will cause threads to block if the resource is currently being taken. It's a way to **protect shared resources and to prevent race conditions.**
-
-They are **owned** by the thread that takes it. Hence, **mutual exclusion!**
-
-This will slow down your threaded program if threads wait too much, so use them sparingly! But you still need to use them to prevent race conditions and to really control the multi-threaded program flow of your program.
-
-They are the **interface** through which you can engage locks for your code!
-
-#### **Deadlocks**
-
-Of course, you need to be careful when you're using mutexes and locks. Overuse of locks will slow down your code, or in certain cases, cause deadlocks, causing your program to completely stall.
-
-![Image result for deadlock](assets/deadlock.png)
-
-[Image Source](<https://www.geeksforgeeks.org/operating-system-process-management-deadlock-introduction/>)
-
-> **Methods for handling deadlock**
->
-> 1) **Deadlock prevention or avoidance**: The idea is to not let the system into deadlock state.
-> One can zoom into each category individually, Prevention is done by negating one of above mentioned necessary conditions for deadlock.
->
-> 2) **Deadlock detection and recovery**: Let deadlock occur, then do preemption to handle it once occurred.
->
-> 3) **Ignore the problem all together**: If deadlock is very rare, then let it happen and reboot the system. This is the approach that both Windows and UNIX take.
->
-> <https://www.geeksforgeeks.org/operating-system-process-management-deadlock-introduction/>
-
-#### **Example Usage**
-
-> Note that this method is **not recommended**. It's actually an [**anti-pattern**](<http://kayari.org/cxx/antipatterns.html#locking-mutex>) but just included for completeness' sake.
-
-```c++
-#include <mutex>
-
-// Create your mutex here
-std::mutex my_mutex;
-
-// 
-thread_function()
-{
-  my_mutex.lock(); // Acquire lock
-  // Do some non-thread safe stuff...
-  my_mutex.unlock(); // Release lock
-}
-```
-
-
-
-### 2.9 A Better Way: Lock Guards <a name="2.9"></a>
-[go to top](#top)
-
-
-It's actually better to just use a lock guard, which manages the lifecycle of a mutex for you.
-
-It's kind of like the `with:` operator in Python.
-
-**Notably, a lock guard releases the lock automatically once the function that it is called in goes out of scope!**
-
-```c++
-#include <mutex>
-
-// Create your mutex here
-std::mutex my_mutex;
- 
-thread_function()
-{
-  std::lock_guard<std::mutex> guard(my_mutex); // Acquire lock
-  // Do some non-thread safe stuff...
-}
-```
-
-
-
-### 2.10 Lock Guard Types <a name="2.10"></a>
-[go to top](#top)
-
-
-So there are actually several lock guard types.
-
-You've already seen the standard lock_guard
-
-#### **std::lock_guard<>**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/lock_guard>)
-
-- Simplest lock guard
-- Takes a mutex on construction
-- Releases the mutex once it goes out of scope
-
-```c++
-std::lock_guard<std::mutex> guard(my_mutex);
-```
-
-#### **std::scoped_lock<>**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/scoped_lock>)
-
-This was introduced in C++17, and is the standard lock guard to use, over `std::lock_guard<>`, which is included for compatibility.
-
-- It's just a lock guard
-- Except it can take **multiple mutexes**
-
-```c++
-std::scoped_lock<std::mutex, std::mutex> guard(mutex_1, mutex_2);
-```
-
-#### **std::unique_lock<>**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/unique_lock>)
-
-- Just like the normal lock guard, except...
-- It initialises an exclusive lock
-- It can be returned from the function without releasing the lock (via move semantics)
-- It can be released before it is destroyed
-- You can also use **nifty lock methods!**
-
-```c++
-std::unique_lock<std::mutex> guard(my_mutex);
-
-// Check if guard owns lock (either works)
-guard.owns_lock();
-bool(guard);
-
-// Return function without releasing the lock
-return std::move(guard);
-
-// Release lock before destruction
-guard.unlock();
-```
-
-If you defer the locks, you can use the **nifty lock methods!**
-
-```c++
-// Initialise the lock guard, but don't actually lock yet
-std::unique_lock<std::mutex> guard(mutex_1, std::defer_lock);
-
-// Now you can do some of the following!
-guard.lock(); // Lock now!
-guard.try_lock(); // Won't block if it can't acquire
-guard.try_lock_for(); // Only for timed_mutexes
-guard.try_lock_until(); // Only for timed_mutexes
-```
-
-#### **std::shared_lock<>**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/shared_lock>)
-
-A shared lock is just like a unique lock, except the lock is a shared lock as opposed to an exclusive one.
-
-- Just like the normal lock guard, except...
-- It initialises a shared lock
-- It can be returned from the function without releasing the lock (via move semantics)
-- It can be released before it is destroyed
-- You can also use **nifty lock methods!**
-
-```c++
-std::shared_lock my_mutex;
-std::shared_lock<std::shared_mutex> guard(my_mutex);
-
-// Check if guard owns lock (either works)
-guard.owns_lock();
-bool(guard);
-
-// Return function without releasing the lock
-return std::move(guard);
-
-// Release lock before destruction
-guard.unlock();
-```
-
-If you defer the locks, you can use the **nifty lock methods!**
-
-```c++
-// Initialise the lock guard, but don't actually lock yet
-std::shared_lock<std::shared_mutex> guard(mutex_1, std::defer_lock);
-
-// Now you can do some of the following!
-guard.lock(); // Lock now!
-guard.try_lock(); // Won't block if it can't acquire
-guard.try_lock_for(); // Only for timed_mutexes
-guard.try_lock_until(); // Only for timed_mutexes
-```
-
-
-
-### 2.11 Exclusive Locks vs Shared Locks <a name="2.11"></a>
-[go to top](#top)
-
-
-**Exclusive locks** (aka write locks) **inhibit all access** from other threads until the lock is released.
-
-**Shared locks** (aka read locks) **inhibit all writes** from other threads until the lock is released. Other threads have to request the lock to be granted the permission to read though.
-
-> Exclusive lock mode prevents the associated resource from being shared. This lock mode is obtained to modify data. The first transaction to lock a resource exclusively is the only transaction that can alter the resource until the exclusive lock is released.
->
-> Share lock mode allows the associated resource to be shared, depending on the operations involved. Multiple users reading data can share the data, holding share locks to prevent concurrent access by a writer (who needs an exclusive lock). Several transactions can acquire share locks on the same resource.
->
-> ---
->
-> Think of a lockable object as a *blackboard* (lockable) in a class room containing a *teacher* (writer) and many *students* (readers).
->
-> While a teacher is writing something (exclusive lock) on the board:
->
-> 1. Nobody can read it, because it's still being written, and she's blocking your view => ***If an object is exclusively locked, shared locks cannot be obtained*.**
-> 2. Other teachers won't come up and start writing either, or the board becomes unreadable, and confuses students => ***If an object is exclusively locked, other exclusive locks cannot be obtained*.**
->
-> When the students are reading (shared locks) what is on the board:
->
-> 1. They all can read what is on it, together => *Multiple shared locks can co-exist*.
-> 2. The teacher waits for them to finish reading before she clears the board to write more => *If one or more shared locks already exist, exclusive locks cannot be obtained*.
->
-> <https://stackoverflow.com/questions/11837428/whats-the-difference-between-an-exclusive-lock-and-a-shared-lock>
-
-Notice this means that **if an object is shared locked, you can acquire shared locks, but not exclusive locks.**
-
-Basically: 
-
-- If there are multiple readers, no writers can bind, but readers can bind.
-- If there is one writer, no one can bind.
-
-
-
-
-### 2.12 Mutex Types <a name="2.12"></a>
-[go to top](#top)
-
-
-There are [several](<https://en.cppreference.com/w/cpp/thread/mutex>) [types](<https://en.cppreference.com/w/cpp/thread/recursive_mutex>) [of](<https://en.cppreference.com/w/cpp/thread/timed_mutex>) [mutex](<https://en.cppreference.com/w/cpp/thread/recursive_timed_mutex>).
-
-#### **std::mutex**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/mutex>)
-
-- Just your plain lockable mutex
-
-#### **std::timed_mutex**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/timed_mutex>)
-
-- Timed mutex
-- You can lock for a specified amount of time with `try_lock_for()` and `try_lock_until()`
-
-#### **std::recursive_mutex**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/recursive_mutex>)
-
-- Multiple locks can be acquired by the same thread
-- You need to call unlock the same amount of times you've called lock before the lock is released
-
-#### **std::recursive_timed_mutex**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/recursive_timed_mutex>)
-
-- Same as the recursive mutex, except it also has the timed locking methods that timed mutexes have
-
-#### **std::shared_timed_mutex**
-
-[Reference](<https://en.cppreference.com/w/cpp/thread/shared_timed_mutex>)
-
-- Read-Write mutex
-- Can acquire both exclusive or shared locks (just use the appropriate lock guard type!)
-
-```c++
-std::unique_lock<std::shared_timed_mutex> writer_guard(writing_mutex, std::defer_lock);
-std::shared_lock<std::shared_timed_mutex> reader_guard(reading_mutex, std::defer_lock);
-
-// Lock them!
-std::lock(writer_guard, reader_guard);
-```
-
-
-
-### 2.13 Event Handling: Condition Variables <a name="2.13"></a>
-[go to top](#top)
-
-
-Sometimes you need to do some nice signal/event handling.
-
-It's possible to do it using a global variable that you constantly lock threads for to check, but it's far more efficient to use **[condition variables](<https://en.cppreference.com/w/cpp/thread/condition_variable>)**.
-
-A condition variable allows you to **wait for some condition to be true** before continuing thread execution. During this time, any locks that were passed to the waiting function are released until the condition is fulfilled. Following which, the lock is reacquired.
-
-> **Example Flow**
->
-> 1. Thread **acquires lock**
-> 2. Check if condition is false
-> 3. If false, call `wait()`, which **releases the lock and blocks the thread until the condition is fulfilled**
-> 4. If a condition is fulfilled, the condition variable **must be notified** before it can check
-> 5. Once the condition check succeeds, **thread reacquires lock and continues execution**
-
-Let's try it out!
-
-Condition variables use unique_locks, so we'll use that.
-
-#### **Basic Example**
-
-```c++
-#include <condition_variable>
-
-// Init
-std::condition_variable condition_var;
-std::mutex mutex;
-bool condition(false);
-
-// Acquire lock
-std::unique_lock<std::mutex> guard(mutex);
-
-// Avoid spurious wakeups and 
-// ensure wait is only called when the condition has not been fulfilled
-while (!condition)
-{
-  condition_var.wait(guard);
-}
-
-// Now in some other thread
-{
-  // Acquire lock
-  std::unique_lock<std::mutex> guard(mutex);
-
-  // We can set the condition to true
-  condition = true;
-
-  // And notify one blocked thread by the condition variable that it's ok to wake up
-  // (In this case we only have one)
-  condition_var.notify_one();
-
-  // If we want to notify all of them instead...
-  condition_var.notify_all();
-    
-  // If we didn't surround the threads with the while (!condition) loop,
-  // Notifying the threads will cause the wait to return. So there's no condition check.
-  // But this is dangerous since random wakeups can occur without notifications!
-}
-```
-
-**You may also choose to make the condition be an atomic boolean instead so you can save on lock acquisition for any thread that sets the condition.**
-
-Like so: `std::atomic<bool> condition(true);`
-
-#### **Additional Methods**
-
-```c++
-// Wait for some time or until some time is reached
-condition_var.wait_for();
-condition_var.wait_until();
-
-// There's also a nice function to cleanup any condition variables by a lock acquiring thread
-// It's an equivalent call to 
-// First: destroying all objects that are meant to destroy on thread exit
-// Then: mutex.unlock(); condition_var.notify_all();
-std::notify_all_at_thread_exit(condition_var, some_unique_lock);
-```
-
-#### **Spurious Wakeups**
-
-A bit tricky. But sometimes condition variables can wakeup on their own due to some [threading technomagic](<https://stackoverflow.com/questions/8594591/why-does-pthread-cond-wait-have-spurious-wakeups>).
-
-It's relatively trivial to guard against it, and it's another layer of protection against human error, so it makes sense to at least try to deal with them explicitly.
-
-```c++
-// You guard against spurious wakeups by surrounding the condition variable
-// with a check for the condition (you're checking the predicate)
-while (!condition)
-{
-  condition_var.wait(guard);
-}
-
-// Alternatively, you can do it this way as well,
-// which is neater but slightly less intuitive
-condition_var.wait(guard, condition_function);
-
-// If we want to just check a bool called condition we need to use lambdas
-condition_var.wait(guard, [](){return condition == true;});
-```
-
-
-
-## 3. C++ Concurrency Reference <a name="3"></a>
-
-### 3.1 Introduction <a name="3.1"></a>
-[go to top](#top)
-
-
-We just went through manual thread handling in the previous section.
-
-But if you're lazy, or you don't need the tight control the thread, mutex, and lock guard classes offer you, you may choose to adopt **task based parallelism** instead, as opposed to **thread based parallelism**. It's generally considered faster to work with tasks as opposed to threads, especially since the chance of tasks messing up is far lower than that of threads.
-
-With the `std::async` library, manual thread handling is **abstracted away**, and you rely on the library's system to possibly spawn threads, depending on available resources. **The main benefit of this form of parallelism is the great ease in getting returned values from tasks that you start.**
-
-Before, when using threads, you'd have to pass variables via reference and have threads modify the variable. But now with tasks, you can just directly return the result of the task!
-
-So instead of thinking of starting the threads yourself, you can only be concerned with starting **tasks** that will return when they are supposed to. If tasks haven't returned yet, the code will block until it does.
-
-
-
-### 3.2 When to Use Threads or Tasks <a name="3.2"></a>
-[go to top](#top)
-
-
-Use **threads** if:
-
-- You need tight control over mutexes
-- Need to run long-lived, complex tasks
-
-Use **tasks** if:
-
-- You want fairly simple code and don't care for managing threads
-- Are running short tasks
-
-
-
-### 3.3 Promises and Futures <a name="3.3"></a>
-[go to top](#top)
-
-
-![1562934941151](assets/1562934941151.png)
-
-[Image Source](<https://modoocode.com/284>)
-
-![1562935061068](assets/1562935061068.png)
-
-[Image Source](<https://www.slideshare.net/cppfrug/async-await-in-c>)
-
-#### **Header**
-
-```c++
-#include <future>
-```
-
-#### **Futures**
-
-A [std::future](<https://en.cppreference.com/w/cpp/thread/future>) is a class template that stores a value that will be assigned in the future, and provides a way to access that value (with `get()`). If its value is accessed before the value is assigned, it will block until the value resolves.
-
-Futures are the objects that are **returned** by asynchronous operations (from `std::async`, `std::packaged_task`, or `std::promise`).
-
-**Shared Futures**
-
-A [std::shared_future](<https://en.cppreference.com/w/cpp/thread/shared_future>) works the same way, except it is copyable. Which means that multiple threads are allowed to wait for the same shared state.
-
-#### **Promises**
-
-A [std::promise](<https://en.cppreference.com/w/cpp/thread/promise>) provides a facility to store a value that is later acquired asynchronously via the future **that the promise creates**.
-
-Every promise **is associated with a future**! And a promise **sets** the value of that future. Other objects can then access the future for the value that the promise stores.
-
-#### **A dumb analogy**
-
-> **Today is a Gift. That is why it is called Present.**
->
-> You're a parent trying to get a gift for your child.
->
-> You give your kid a box, and **promise** them that the gift is inside. The gift is the **future** you are promising. But you tell them to only to check in the future.
->
-> If your kid tries to check, you panic, take the box away and, **block** them from checking, until you **fulfill your promise and fill the box** with the gift, then you can give it back and your kid can continue his day having gotten their gift.
-
-#### **A slightly better analogy**
-
-> **Food Analogy**
->
-> Let's say you're an office worker. You make an order for lunch from a store across the street via your phone app.
->
-> The store owner receives your order, and by the powers of the social contract, makes a **promise** to fulfill your order. He issues you a receipt that is associated with this **promise**, guaranteeing you that you will be able to collect your order in the **future** if he ever fulfills his promise.
->
-> You **block** off some time, stop your work at the office, and head down to the store.
->
-> But OH NO! The store owner hasn't fulfilled your order yet. And as long as you're waiting to **get()** your order, you can't do any work. Some might even say your **waiting to get your order in the future is blocking your ability to work.**
->
-> Once the store owner **sets()** your order down, and lets you **get()** it from his counter though, you're able to **stop getting blocked** and go back to the office to work.
-
-![mindblow](assets/mindblow.gif)
-
-
-
-### 3.4 A Simple Promise-Future Example <a name="3.4"></a>
-[go to top](#top)
-
-
-![std::promise and std::future](assets/promise.png)
-
-[Image Source](<https://thispointer.com//c11-multithreading-part-8-stdfuture-stdpromise-and-returning-values-from-thread/>)
-
-**Note:** If your promise object is destroyed before you set its value, the `get()` method for its associated future will throw an exception.
-
-**Also note:** Each future's `get()` method can only be called once. If you want a future that can be accessed multiple times, use a shared_future instead. Otherwise, **initialise a different promise future pair.**
-
-```c++
-// Create a promise
-std::promise<int> promise;
-
-// And get its future
-std::future<int> future = promise.get_future();
-
-// You can also get a shared future this way, by the way! (Choose one please)
-std::shared_future<int> shared_future = promise.get_future();
-
-// Now suppose we passed promise to a separate thread.
-// And in the main thread we call...
-int val = future.get(); // This will block!
-
-// Until, that is, we set the future's value via the promise
-promise.set_value(10); // In the separate thread
-
-// So now in the main thread, if we try to access val...
-std::cout << val << std::endl;
-
-// Output: 10
-```
-
-Or, more completely
-
-```c++
-// Source: https://thispointer.com//c11-multithreading-part-8-stdfuture-stdpromise-and-returning-values-from-thread/
-
-#include <iostream>
-#include <thread>
-#include <future>
- 
-void initiazer(std::promise<int> * promObj)
-{
-    std::cout<<"Inside Thread"<<std::endl;     promObj->set_value(35);
-}
- 
-int main()
-{
-    std::promise<int> promiseObj;
-    std::future<int> futureObj = promiseObj.get_future();
-    std::thread th(initiazer, &promiseObj);
-    std::cout<<futureObj.get()<<std::endl;
-    th.join();
-    return 0;
-}
-```
-
-
-
-### 3.5 Async <a name="3.5"></a>
-[go to top](#top)
-
-
-[std::async](<https://en.cppreference.com/w/cpp/thread/async>)
-
-Now that we've talked about futures and promises we can finally actually get to the real asynchronous coding library.
-
-Async is a function template allows you to spawn threads to do work, then collect the results from them via the **future** mechanism. In fact, calls to `std::async` return a `std::future` object!
-
-**Do note that async does support parallelism, just that the default constructor manages threads for you and may possibly not run the passed functions in a thread. You'll have to explicitly tell it to run the function in a new thread.**
-
-Also, since Linux threads run sequentially by default, it's especially important to force the functions to run in separate threads. We'll see how to do that later.
-
-The simplest call to async is to just pass in a callback function as an argument, and let the system handle it for you.
-
-```c++
-auto future = std::async(some_function, arg_1, arg_2);
-```
-
-
-
-### 3.6 Async Launch Policies <a name="3.6"></a>
-[go to top](#top)
-
-
-You can do better though!
-
-There are three ways to launch an async task:
-
-- `std::launch::async` : Guarantees launch in a separate thread
-- `std::launch::deferred`: Function will only be called on `get()`
-- `std::launch::async | std::launch::deferred`: Default behaviour. Defer to system.
-
-I like to run async tasks with the `std::launch::async` profile so I can have some semblance of control over the threads. Just **add it in as the first argument!**
-
-```c++
-auto future = std::async(std::launch::async, some_function, arg_1, arg_2);
-```
-
-
-
-### 3.7 Different Ways to Call Async <a name="3.7"></a>
-[go to top](#top)
-
-
-```c++
-// Pass in function pointer
-auto future = std::async(std::launch::async, some_function, arg_1, arg_2);
-
-// Pass in function reference
-auto future = std::async(std::launch::async, &some_function, arg_1, arg_2);
-
-// Pass in function object
-struct SomeFunctionObject
-{
-	void operator() (int arg_1){}
-};
-auto future = std::async(std::launch::async, SomeFunctionObject(), arg_1);
-
-// Lambda function
-auto future = std::async(std::launch::async, [](){});
-```
-
-
-
-
-```
-                            .     .
-                         .  |\-^-/|  .    
-                        /| } O.=.O { |\     
-```
-
----
-
- [![Yeah! Buy the DRAGON a COFFEE!](../_assets/COFFEE%20BUTTON%20%E3%83%BE(%C2%B0%E2%88%87%C2%B0%5E).png)](https://www.buymeacoffee.com/methylDragon)
-
- C++ Concurrency in Action: Practical Multithreading
-by Anthony Williams
-Overview
-Table of Contents
-Errata
-Buy the book
-
-C++ Concurrency in Action (second edition, published 2019 by Manning Publications) is the definitive reference and guide to writing multithreaded code with Standard C++. It is suitable for all levels of C++ programmers, including those who have never previously written any multithreaded code. This book will show you how to write robust multithreaded applications in C++ while avoiding common pitfalls.
-
-It's not just the best current treatment of C++11's threading facilities ... it's likely to remain the best for some time to come.Scott Meyers
-This book should be on every C++ programmer's desk. It's clear, concise, and valuable.Rob Green, Bowling Green State University
-Overview
-Systems with multiple processors or processors with multiple cores are the norm these days; even many phones have multicore processors. To take advantage of these processor cores you need to use concurrency, either in the form of multiple processes or multiple threads.
-
-The C++17 standard provides extensive support for writing multithreaded code to take advantage of these multicore and multiprocessor systems. C++ Concurrency in Action explains how these facilities work, and how to use them to best effect.
-
-This book provides a tutorial covering the use of the library facilities introduced in the last three C++ standards. It covers everything from the basics such as std::thread, std::future and std::condition_variable, to an in-depth description of the new memory model and std::atomic classes for low level synchronization and the new C++17 parallel algorithms. In later chapters, the book then goes on to cover the design of multithreaded code, including lock-free data structures and thread pools. Finally, there is a chapter on testing and debugging multithreaded applications.
-
-It doesn't stop there though: the appendices include a brief overview of the some of the C++ language features either used by the multithreading facilties, or commonly used in conjunction with them, such as variadic templates, lambda functions and rvalue references, as well as a 150 page reference covering every class and function in the C++ Standard Thread Library. The book also covers the additional facilities from the Concurrency TS that aren't yet part of the main C++ standard.
-
-Additional material in the second edition
-In addition to all the material from the first edition, the second edition (published in 2019) includes full coverage of the library changes from C++14 and C++17:
-
-std::shared_mutex and std::shared_timed_mutex. These provide for multiple-reader/single-writer mutex locks.
-std::scoped_lock from C++17 for locking multiple mutexes together.
-Parallel overloads of many standard library algorithms include std::sort, std::for_each and std::transform_reduce.
-Plus, full coverage of the library extensions from the concurrency TS:
-
-std::experimental::latch to allow waiting for a set number of events to occur
-std::experimental::barrier and std::experimental::flex_barrier to synchronize groups of threads
-std::experimental::atomic_shared_ptr to allow atomic accesses to a single shared_ptr instance from multiple threads, as a better alternative that the std::atomic_load and std::atomic_store free functions.
-Extended futures that allow continuations, so additional functions can be scheduled for when a future is ready.
-std::experimental::when_all and std::experimental::when_any to allow waiting for either all of a set of futures to be ready, or the first of a set of futures to be ready.
-
-geeksforgeeks
-Search...
-Courses
-Tutorials
-Interview Prep
-
-Sign In
-C++ Tutorial
-Interview Questions
-Examples
-Quizzes
-Projects
-Cheatsheet
-OOP
-Exception Handling
-STL
-DSA C++
-search icon
-Sign In
-Multithreading in C++
-Last Updated : 3 Oct, 2025
-Multithreading is a technique where a program is divided into smaller units of execution called threads. Each thread runs independently but shares resources like memory, allowing tasks to be performed simultaneously. This helps improve performance by utilizing multiple CPU cores efficiently. Multithreading support was introduced in C++11 with the introduction of <thread> header file.
-
-Importance of Multithreading
-Leverages multiple CPU cores to execute tasks in parallel, reducing overall execution time.
-Keeps applications responsive by running background operations without blocking the main thread. For example, in a word document, one thread does auto-formatting along with the main thread.
-Makes it easier to handle large workloads or multiple simultaneous operations, such as in servers or real-time systems.
-Common Operations On Thread
-The <thread> header in C++ provides a simple and powerful interface for managing threads. Below are some of the most common operations performed on threads:
-
-Create a Thread
-The std::thread class represent the thread. Threading an instance of this class will create a thread with the given callable as its task.
-
-
-thread thread_name(callable);
-where,
-
-thread_name: It is object of thread class.
-callable: It is a callable object like function pointer, function object.
-Example:
-
-
-#include <bits/stdc++.h>
-using namespace std;
-
-// Function to be run by the thread
-void func() {
-    cout << "Hello from the thread!" << endl;
-}
-
-int main() {
-    
-    // Create a thread that runs 
-    // the function func
-    thread t(func);
-    
-    // Main thread waits for 't' to finish
-    t.join();  
-    cout << "Main thread finished.";
-    return 0;
-}
-
-Output
-
-Hello from the thread!
-Main thread finished.
-Explanation: In the above program we have created a thread t that prints "Hello from the thread!" and this thread is joined with the main thread so that the main thread waits for the completion of this thread and once the thread t is finished the main thread resumes its execution and prints " Main thread finished".
-
-Joining a Thread
-Before joining a thread it is preferred to check if the thread can be joined using the joinable() method. The joinable method checks whether the thread is in a valid state for those operations or not.
-
-
-thread_name.joinable()
-The joinable() method returns true if the thread is joinable else returns false.
-
-Joining a Thread: Joining two threads C++ blocks the current thread until the thread associated with the std::thread object finishes execution. To join two threads ini C++ we can use join() function. Which is called inside the bidy of the thread to which the specified thread is to be joined.
-
-
-thread_name.join(); 
-The thread.join function throws std::system_error if the thread is not joinable.
-
-Note: Joining two non-main threads is risky as it may lead to race condition or logic errors.
-
-Detaching a thread
-A joined thread can be detached from the calling thread using the detach() member function of the std::thread class. When a thread is detached, it runs independently in the background, and the other thread does not waits for it to finish.
-
-
-thread_name.detach();
-Getting Thread ID
-In Multithreading in C++ each thread has a unique ID which can be obtained by using the get_id() function.
-
-
-thread_name.get_id();
-The get_id() function returns an object representing the thread’s ID
-
-Example program using the above operations altogether.
-
-
-
-#include <iostream>
-#include <thread>
-#include <chrono>
-using namespace std; 
-​
-void task1() {
-    cout << "Thread 1 is running. ID: " << this_thread::get_id() << "\n";
-}
-​
-void task2() {
-    cout << "Thread 2 is running. ID: " << this_thread::get_id() << "\n";
-}
-​
-int main() {
-    thread t1(task1);
-    thread t2(task2);
-​
-    // Get thread IDs
-    cout << "t1 ID: " << t1.get_id() << "\n";
-    cout << "t2 ID: " << t2.get_id() << "\n";
-​
-    // Join t1 if joinable
-    if (t1.joinable()) {
-        t1.join();
-        cout << "t1 joined\n";
-    }
-​
-    // Detach t2
-    if (t2.joinable()) {
-        t2.detach();
-        cout << "t2 detached\n";
-    }
-​
-    cout << "Main thread sleeping for 1 second...\n";
-    this_thread::sleep_for(chrono::seconds(1));
-    cout << "Main thread awake.\n";
-​
-    return 0;
-}
-Output:
-
-t1 ID: 0x1234
-t2 ID: 0x5678
-Thread 1 is running. ID: 0x1234
-t1 joined
-Thread 2 is running. ID: 0x5678
-t2 detached
-Main thread sleeping for 1 second...
-Main thread awake.
-Callables in Multithreading
-A callable (such as a function, lambda, or function object) is passed to a thread. The callable is executed in parallel by the thread when it starts. like, thread t(func); creates a thread that runs the func function. We can also pass parameters along with callable, like this thread t(func, param1, param2);
-
-In C++, callable can be divided into 4 categories:
-
-Function
-Lambda Expression
-Function Object
-Non-Static or static Member Function
-Function Pointer
-A function can be a callable object to pass to the thread constructor for initializing a thread.
-
-
-
-
-#include <bits/stdc++.h>
-using namespace std;
-​
-// Function to be run 
-// by the thread
-void func(int n) {
-    cout << n;
-}
-​
-int main() {
-    
-    // Create a thread that runs 
-    // the function func
-    thread t(func, 4);
-    
-    // Wait for thread to finish
-    t.join();
-    return 0;
-}
-
-Output
-
-4
-Lambda Expression
-Thread object can also use a lambda expression as a callable. Which can be passed directly inside the thread object.
-
-
-
-
-#include <iostream>
-#include <thread>
-​
-using namespace std;
-​
-int main() {
-    int n = 3;
-    
-    // Create a thread that runs 
-    // a lambda expression
-    thread t([](int n){
-        cout << n;
-    }, n);
-​
-    // Wait for the thread to complete
-    t.join();
-    return 0;
-}
-
-Output
-
-3
-Function Objects
-Function Objects or Functors can also be used for a thread as callable. To make functors callable, we need to overload the operator parentheses operator ().
-
-
-
-
-#include <iostream>
-#include <thread>
-using namespace std;
-​
-// Define a function object (functor)
-class SumFunctor {
-public:
-    int n;
-    SumFunctor(int a) : n(a) {}
-​
-    // Overload the operator() to 
-    // make it callable
-    void operator()() const {
-        cout << n;
-    }
-};
-​
-int main() {
-​
-    // Create a thread using 
-    // the functor object
-    thread t(SumFunctor(3));
-​
-    // Wait for the thread to 
-    // complete
-    t.join();
-    return 0;
-}
-
-Output
-
-3
-Non-Static and Static Member Function
-We can also use thread using the non-static or static member functions of a class. For non-static member function, we need to create an object of a class but it's not necessary with static member functions.
-
-
-
-
-#include <iostream>
-#include <thread>
-​
-using namespace std;
-​
-class MyClass {
-public:
-    // Non-static member function
-    void f1(int num) {
-        cout << num << endl;
-    }
-​
-    // Static member function that takes one parameter
-    static void f2(int num) {
-        cout << num;
-    }
-};
-​
-int main() {
-    
-    // Member functions 
-    // requires an object
-    MyClass obj;
-    
-    // Passing object and parameter
-    thread t1(&MyClass::f1, &obj, 3);
-    
-    t1.join(); 
-    
-    // Static member function can 
-    // be called without an object
-    thread t2(&MyClass::f2, 7);
-    
-    // Wait for the thread to finish
-    t2.join();  
-​
-    return 0;
-}
-
-Output
-
-3
-7
-Thread Management
-In C++ thread library, various functions are defined to manage threads that can be reused to perform multiple tasks. Some of the are listed below:
-
-Classes/Methods	Description
-join()	It ensures that the calling thread waits for the specified thread to complete its execution.
-detach()	Allows the thread to run independently of the main thread, meaning the main thread does not need to wait.
-mutex	A mutex is used to protect shared data between threads to prevent data races and ensure synchronization.
-lock_guard	A wrapper for mutexes that automatically locks and unlocks the mutex in a scoped block.
-condition_variable	Used to synchronize threads, allowing one thread to wait for a condition before proceeding.
-atomic	Manages shared variables between threads in a thread-safe manner without using locks.
-sleep_for()	Pauses the execution of the current thread for a specified duration.
-sleep_until()	Pauses the execution of the current thread until a specified time point is reached.
-hardware_concurrency()	Returns the number of hardware threads available for use, allowing you to optimize the use of system resources.
-get_id	Retrieves the unique ID of the current thread, useful for logging or debugging purposes.
-Problems with Multithreading
-Multithreading improves the performance and utilization of CPU, but it also introduces various problems:
-
-Deadlock
-Race Condition
-Starvation
-Deadlock
-A deadlock occurs when two or more threads are blocked forever because they are each waiting for shared resources that the other threads hold. This creates a cycle of waiting, and none of the threads can proceed.
-
-Race Condition
-A race condition occurs when two or more threads access shared resources at the same time, and at least one of them modifies the resource. Since the threads are competing to read and write the data, the final result depends on the order in which the threads execute, leading to unpredictable or incorrect results.
-
-Starvation
-Starvation occurs when a thread is continuously unable to access shared resources because other threads keep getting priority, preventing it from executing and making progress.
-
-Thread Synchronization
-In multithreading, synchronization is the way to control the access of multiple threads to shared resources, ensuring that only one thread can access a resource at a time to prevent data corruption or inconsistency. This is typically done using tools like mutexes, locks, and condition variables.
-
-Context switch in multithreading
-Context switch is a process in multithreading the process where the CPU stops the execution of one thread and begins executing another within the same process. In this process the CPU stores the state of the running thread so that it can be restored later once the CPU finishes the execution of the other thread.
-
-Comment
-S
-
-Sayan Mahapatra
-
-Follow
-
-137
-Article Tags:
-C++
-cpp-multithreading
-Explore
-C++ Basics
-Core Concepts
-OOP in C++
-Standard Template Library(STL)
-Practice & Problems
-GeeksforGeeks
-location
-Corporate & Communications Address:
-A-143, 7th Floor, Sovereign Corporate Tower, Sector- 136, Noida, Uttar Pradesh (201305)
-location
-Registered Address:
-K 061, Tower K, Gulshan Vivante Apartment, Sector 137, Noida, Gautam Buddh Nagar, Uttar Pradesh, 201305
-GFG App on Play Store
-GFG App on App Store
-Company
-About Us
-Legal
-Privacy Policy
-Contact Us
-Advertise with us
-GFG Corporate Solution
-Campus Training Program
-Explore
-POTD
-Job-A-Thon
-Blogs
-Nation Skill Up
-Tutorials
-Programming Languages
-DSA
-Web Technology
-AI, ML & Data Science
-DevOps
-CS Core Subjects
-Interview Preparation
-Software and Tools
-Courses
-ML and Data Science
-DSA and Placements
-Web Development
-Programming Languages
-DevOps & Cloud
-GATE
-Trending Technologies
-Videos
-DSA
-Python
-Java
-C++
-Web Development
-Data Science
-CS Subjects
-Preparation Corner
-Interview Corner
-Aptitude
-Puzzles
-GfG 160
-System Design
-@GeeksforGeeks, Sanchhaya Education Private Limited, All rights reserved
-
-
-Do Not Sell or Share My Personal Information
\ No newline at end of file
diff --git a/doc/strand-rationale.md b/doc/strand-rationale.md
new file mode 100644
index 000000000..37ebf082a
--- /dev/null
+++ b/doc/strand-rationale.md
@@ -0,0 +1,146 @@
+# Strand: Why Per-Strand Implementation
+
+A strand has two reasonable internal designs. The simpler one pools
+serialization state across strands; the correct one allocates state
+per-strand. Capy uses the per-strand design. This document explains why
+the simpler design is wrong and what the per-strand design costs.
+
+## The previous design
+
+Capy's original strand service held a fixed array of `strand_impl`
+objects, 211 slots, allocated inline in the service and never freed
+individually. When a user constructed a new strand, the service
+incremented a counter and returned a pointer to `impls_[counter % 211]`.
+
+```cpp
+strand_impl impls_[211];
+std::size_t salt_;
+
+strand_impl* get_implementation()
+{
+    std::lock_guard lock(mutex_);
+    return &impls_[salt_++ % 211];
+}
+```
+
+This is pure round-robin: the 1st strand gets slot 0, the 212th strand
+gets slot 0 again. Two strands that map to the same slot share the same
+`strand_impl` object.
+
+Each `strand_impl` holds:
+
+- a mutex (`mutex_`)
+- a pending operation queue (`pending_`)
+- a locked flag (`locked_`)
+- the executor identity used by whichever invoker is currently
+  dispatching
+
+Two strands that share a slot share all of this.
+
+## What sharing actually shares
+
+Sharing a mutex is not inherently a problem. Two strands that hold the
+same mutex contend on push and pop operations, which are brief. They
+still proceed independently afterward.
+
+Sharing a queue and a locked flag is a different matter. Those are the
+state machine that determines which work runs, in what order, and
+through which executor. When two logically independent strands share
+this state, the following become possible:
+
+**Cross-strand blocking.** Strand A is mid-dispatch, so `locked_` is
+true. Strand B posts a new operation. B's post sees `locked_` already
+set and adds its work to the shared queue without posting a new
+invoker. B's work now waits behind A's entire dispatch cycle, even
+though A and B are supposed to be independent.
+
+**Wrong executor dispatch.** The invoker that won the unlocked-to-locked
+transition captures the executor of the strand that triggered it. Call
+this strand A. If strand B later enqueues work into the shared state,
+that work runs through A's executor, not B's. For strands that wrap
+the same underlying thread pool, this is invisible. For strands that
+wrap different executor layers (a metrics wrapper, a type-erased
+`any_executor`, a test shim), operations execute through the wrong
+executor, violating the invariants the user associated with B's
+executor.
+
+**False equality.** `operator==` on two distinct strands returns true
+when they map to the same slot, because equality is defined as pointer
+identity of the impl.
+
+## Why per-strand is the right choice
+
+The correctness argument is simple: strand isolation is part of the
+contract. The word "strand" implies a serialization domain that is
+independent of all other strands. A user who writes code against two
+strands is justified in expecting that progress on one does not depend
+on progress on the other, and that work posted to one runs through
+that strand's executor, not a neighbor's.
+
+The pooled design cannot provide this guarantee for more than 211
+strands from the same context.
+
+One possible response is randomization: instead of pure round-robin,
+use a hash of the strand's address mixed with a salt counter. This
+spreads collisions across time so that (0, 211), (1, 212) are no longer
+the deterministic collision pairs. It does not remove collisions. With
+1000 strands from one context, roughly five collision pairs exist
+somewhere in the set. The bug surface is narrower and harder to trigger
+reproducibly, but the class of bug is identical.
+
+Randomization fixed a performance symptom (deterministic starvation)
+without fixing the correctness problem (shared state between independent
+strands). Treating these as the same fix is a category error.
+
+The per-strand design removes the impl pool entirely. Each strand
+allocates its own `strand_impl` via `make_shared`. Two strands never
+share a queue, a locked flag, or an invoker. Isolation is unconditional.
+
+The mutex pool stays. 193 mutexes for any number of strands is a real
+saving over allocating a mutex per strand. Unlike the impl pool, mutex
+sharing has no semantic consequence: the critical sections guarded by
+the mutex cover only push/pop and the locked flag check. Two strands
+that briefly contend on a shared mutex wait for each other's push/pop
+then proceed independently. No state crosses the boundary.
+
+The key insight is that isolation and contention are not the same
+problem. The impl pool conflated them. Removing the impl pool eliminates
+the isolation problem; keeping the mutex pool manages the contention
+cost without reintroducing the isolation problem.
+
+## What the per-strand design costs
+
+**One allocation per strand.** `make_shared<strand_impl>` allocates
+roughly 80-96 bytes on typical allocators with per-thread arenas
+(glibc, jemalloc, tcmalloc). For any strand that posts at least one
+operation, this is negligible against the work being dispatched.
+
+**One pointer of additional size per strand handle.** The strand object
+holds a `shared_ptr<strand_impl>` rather than a raw pointer. A
+`shared_ptr` is two pointers wide; a raw pointer is one. Strand objects
+grow by one pointer (typically 8 bytes).
+
+**Two atomic refcount operations per invoker creation/destruction.** The
+invoker coroutine frame holds a copy of the `shared_ptr`, so the
+reference count increments when the invoker starts and decrements when
+it finishes. These are not on the hot post path; they happen at the
+unlocked-to-locked transition (once per dispatch batch), not on every
+enqueue.
+
+The mutex pool bounds memory growth at 193 mutexes regardless of how
+many strands exist. A program that creates 10,000 strands does not get
+10,000 mutexes; it gets at most 193.
+
+## Tradeoffs we did not take
+
+**Per-strand mutex.** Allocating a mutex per strand would eliminate the
+mutex pool entirely and remove all cross-strand contention. The cost is
+roughly 40 extra bytes per strand. The benefit is marginal: the
+critical sections that use the pool mutex are brief, and contention
+between unrelated strands is unlikely in practice. This option remains
+open if benchmarks show real contention under specific workloads.
+
+The chosen design (per-strand impl, shared mutex pool) matches the
+strategy used by current executor-aware strand implementations in the
+C++ library space, which provides confidence that the tradeoffs are
+well understood.
diff --git a/doc/strand-spec.md b/doc/strand-spec.md
new file mode 100644
index 000000000..6b48d532e
--- /dev/null
+++ b/doc/strand-spec.md
@@ -0,0 +1,303 @@
+# Strand
+
+Each strand allocates a private serialization state via
+`shared_ptr<strand_impl>`. Strands sharing an execution context borrow
+mutexes from a 193-entry pool but never share their queues, `locked_`
+flag, or invoker. This document is the design contract; see
+`strand-rationale.md` for the motivation.
+
+## Goals
+
+- Strand isolation is absolute, not probabilistic. Two distinct strands
+  never share a queue, a `locked_` flag, or a dispatcher executor.
+- Public API of `strand<Ex>` is unchanged: same operations, same
+  equality semantics, same `running_in_this_thread`.
+- Construction cost: one `std::make_shared` per strand.
+- Capy's existing performance optimizations are preserved: the
+  `strand_queue` per-post wrapper recycler stays per-impl; the invoker
+  coroutine frame cache moves to the service.
+
+## Non-goals
+
+- Per-strand mutex. The shared mutex pool stays. Revisit if benchmarks
+  show contention from shared mutexes.
+- Performance tuning of the mutex pool size or salt function.
+- Lock-free hot path. The per-impl mutex is taken under
+  `post`/`dispatch` for queue mutation and `locked_` flag check.
+- Allocator plumbing for `allocate_shared`. Default-construct now; can
+  add an allocator parameter later without changing this design.
+- Changes to `strand_queue`. Its free-list stays; lifetime is bounded by
+  the impl that owns it.
+
+## Design
+
+### Data structures
+
+```cpp
+struct strand_impl
+    : intrusive_list<strand_impl>::node
+{
+    std::mutex* mutex_ = nullptr;          // borrowed from service pool
+    strand_queue pending_;
+    bool locked_ = false;
+    std::atomic<std::thread::id> dispatch_thread_{};
+
+    std::atomic<strand_service_impl*> service_{nullptr};
+
+    ~strand_impl();
+};
+
+class strand_service_impl : public strand_service
+{
+    static constexpr std::size_t num_mutexes = 193;
+
+    std::mutex mutex_;
+    std::size_t salt_ = 0;
+    std::shared_ptr<std::mutex> mutexes_[num_mutexes];
+    intrusive_list<strand_impl> impl_list_;
+    std::atomic<void*> invoker_frame_cache_{nullptr};
+};
+
+template<typename Ex>
+class strand
+{
+    std::shared_ptr<detail::strand_impl> impl_;
+    Ex ex_;
+};
+```
+
+Key design choices:
+
+- Each strand owns its `strand_impl` via `shared_ptr` (no pooling of
+  impls).
+- `strand` holds `shared_ptr<strand_impl>` rather than a raw pointer
+  (size grows by one pointer).
+- The invoker frame cache lives on the service, not the impl. The cache
+  slot always points at a structure that lives for the execution
+  context's lifetime, removing the lifetime hazard that would otherwise
+  affect per-strand impls.
+- `strand_impl` holds a borrowed `mutex_` pointer, an intrusive list
+  base class (via `intrusive_list<strand_impl>::node`), and a
+  back-pointer to the service.
+- The service holds a 193-entry mutex pool, the head of the live-impl
+  linked list, and the invoker frame cache slot.
+- 193 is a prime large enough that hash collisions are rare in practice
+  while keeping the static mutex array small.
+
+### Public detail-header surface
+
+```cpp
+class BOOST_CAPY_DECL strand_service
+    : public execution_context::service
+{
+public:
+    virtual ~strand_service();
+
+    // Returns shared_ptr instead of raw pointer.
+    virtual std::shared_ptr<strand_impl>
+    create_implementation() = 0;
+
+    static bool
+    running_in_this_thread(strand_impl& impl) noexcept;
+
+    // Takes shared_ptr by const-ref so post_invoker can capture
+    // lifetime on the unlocked-to-locked transition without paying an
+    // atomic refcount on every post when the invoker is already running.
+    static std::coroutine_handle<>
+    dispatch(
+        std::shared_ptr<strand_impl> const& impl,
+        executor_ref ex,
+        std::coroutine_handle<> h);
+
+    static void
+    post(
+        std::shared_ptr<strand_impl> const& impl,
+        executor_ref ex,
+        std::coroutine_handle<> h);
+};
+```
+
+The strand constructor calls `create_implementation()` and stores the
+returned `shared_ptr`. Public-API surface of `strand<Ex>` does not
+change. The `running_in_this_thread` query is non-mutating and does
+not extend lifetime, so it stays as `strand_impl&`.
+
+### Construction
+
+```cpp
+std::shared_ptr<strand_impl>
+strand_service_impl::create_implementation()
+{
+    auto new_impl = std::make_shared<strand_impl>();
+
+    std::lock_guard lock(mutex_);
+
+    std::size_t s = salt_++;
+    std::size_t idx = reinterpret_cast<std::size_t>(new_impl.get());
+    idx += idx >> 3;
+    idx ^= s + 0x9e3779b9 + (idx << 6) + (idx >> 2);
+    idx %= num_mutexes;
+    if(!mutexes_[idx])
+        mutexes_[idx] = std::make_shared<std::mutex>();
+    new_impl->mutex_ = mutexes_[idx].get();
+
+    impl_list_.push_back(new_impl.get());
+    new_impl->service_ = this;
+
+    return new_impl;
+}
+```
+
+The hash mixes the impl's address with a monotonic salt and the golden
+ratio constant. The salt prevents deterministic collision sequences
+when the allocator returns predictable addresses; the address bits
+spread otherwise-correlated allocations. Mutex slots are allocated
+lazily: a program that creates few strands never instantiates all 193
+mutexes. The impl is appended to `impl_list_` via `push_back`; order
+does not matter since shutdown drains the entire list.
+
+### Dispatch / post
+
+State machine is unchanged from the previous design. The key
+differences:
+
+- `enqueue`, `dispatch_pending`, `try_unlock` operate on a strand's own
+  `pending_` and `locked_` (no cross-strand sharing).
+- The mutex they take is `*impl.mutex_`, which may be shared with other
+  impls that hashed to the same pool slot. Critical sections cover only
+  brief queue push/pop and the `locked_` flag check.
+- The static `post`/`dispatch` entry points take
+  `shared_ptr<strand_impl> const&`. When the unlocked-to-locked
+  transition wins, they copy the shared_ptr into `post_invoker`, which
+  passes it as the coroutine parameter held in the coroutine frame.
+  That keeps the impl alive for the duration of the dispatch cycle,
+  even if the user drops their last strand handle. When the transition
+  does not win (work is enqueued onto an already-running invoker), no
+  shared_ptr copy is made. The existing invoker's frame already holds
+  a reference. The hot path adds zero atomic refcount operations versus
+  the previous raw-pointer code.
+
+### Invoker frame allocation
+
+```cpp
+void* operator new(std::size_t n, strand_impl& impl)
+{
+    auto* svc = impl.service_;
+    constexpr auto A = alignof(strand_service_impl*);
+    std::size_t padded = (n + A - 1) & ~(A - 1);
+    std::size_t total = padded + sizeof(strand_service_impl*);
+
+    void* p = svc->invoker_frame_cache_.exchange(
+        nullptr, std::memory_order_acquire);
+    if(!p || p == kCacheClosed)
+        p = ::operator new(total);
+
+    *reinterpret_cast<strand_service_impl**>(
+        static_cast<char*>(p) + padded) = svc;
+    return p;
+}
+
+void operator delete(void* p, std::size_t n) noexcept
+{
+    constexpr auto A = alignof(strand_service_impl*);
+    std::size_t padded = (n + A - 1) & ~(A - 1);
+    auto* svc = *reinterpret_cast<strand_service_impl**>(
+        static_cast<char*>(p) + padded);
+
+    void* expected = nullptr;
+    if(!svc->invoker_frame_cache_.compare_exchange_strong(
+            expected, p, std::memory_order_release))
+        ::operator delete(p);
+}
+```
+
+The trailer holds a service pointer (lifetime: execution context),
+not an impl pointer (lifetime: per-strand). The invoker's `make_invoker`
+parameter is a shared_ptr stored in the coroutine frame; that one copy
+keeps the impl alive past any user-side strand drop. `operator delete`
+reads only the trailer (service-scoped), so impl may be dead at delete
+time without consequence.
+
+### Destruction
+
+```cpp
+strand_impl::~strand_impl()
+{
+    auto* svc = service_.load(std::memory_order_acquire);
+    if(!svc) return;
+    std::lock_guard lock(svc->mutex_);
+    svc->impl_list_.remove(this);
+}
+```
+
+`~strand_queue` (already implemented) destroys any pending wrappers
+without resuming them. That covers the case where work was queued but
+the inner executor never invoked the invoker before context teardown.
+
+### Shutdown
+
+```cpp
+void strand_service_impl::shutdown() override
+{
+    std::lock_guard lock(mutex_);
+    while(auto* p = impl_list_.pop_front())
+    {
+        std::lock_guard impl_lock(*p->mutex_);
+        p->locked_ = true;
+        p->service_.store(nullptr, std::memory_order_release);
+    }
+
+    void* fp = invoker_frame_cache_.exchange(
+        kCacheClosed, std::memory_order_acq_rel);
+    if(fp) ::operator delete(fp);
+}
+```
+
+After shutdown, user-held strands still own their impls via
+`shared_ptr`. When they drop, `~strand_impl` sees `service_ == nullptr`
+and short-circuits without touching service state, which may have been
+freed.
+
+### Lifetime cases
+
+1. **User drops strand, no work in flight.** Last `shared_ptr` drops;
+   `~strand_impl` unlinks; impl freed. `~strand_queue` discards any
+   wrappers (edge case only; `enqueue` posts the invoker on the
+   unlocked-to-locked transition, so wrappers are normally drained
+   before the strand becomes inactive).
+
+2. **User drops strand, invoker still running.** The invoker promise
+   holds the last `shared_ptr`; impl stays alive; invoker drains and
+   exits at `final_suspend`. Frame deletion order: promise destructor
+   (releases shared_ptr, runs `~strand_impl`), then `operator delete`
+   (recycles frame to service cache; service is still alive). Safe.
+
+3. **Service shutdown while user holds strand.** Shutdown unlinks the
+   impl from the list, marks it locked, and nulls its `service_`
+   back-pointer. When the user later drops the strand, `~strand_impl`
+   sees `service_ == nullptr` and short-circuits without touching
+   service state, which may have been freed.
+
+4. **Service shutdown with invoker queued but never invoked.** The
+   inner executor's destructor drops the queued continuation; the
+   coroutine handle is never destroyed; the promise's `shared_ptr`
+   never releases; impl and frame leak. Pre-existing behavior, not
+   introduced by this design.
+
+5. **Service shutdown with invoker mid-execution.** The invoker accesses
+   the service only via the trailer in `operator delete` (cache-slot
+   recycle). Shutdown sets the cache to `kCacheClosed`; concurrent
+   invokers see the sentinel and call `::operator delete` instead. The
+   service object itself must outlive any in-flight invoker. Capy's
+   `execution_context` teardown is responsible for stopping the inner
+   executor (which drains queued continuations) before destroying
+   services. This matches the contract the previous implementation
+   relied on.
+
+### Move semantics
+
+The documented contract is unchanged: "a moved-from strand is only safe
+to destroy or reassign." The moved-from `shared_ptr` is nullptr; calls
+on it dereference nullptr, which enforces the contract rather than
+merely documenting it. The previous design left the moved-from strand
+silently pointing at the same impl as the moved-to strand.
diff --git a/doc/unlisted/buffers-index.adoc b/doc/unlisted/buffers-index.adoc
index a3c4129e5..675b66300 100644
--- a/doc/unlisted/buffers-index.adoc
+++ b/doc/unlisted/buffers-index.adoc
@@ -182,7 +182,7 @@ task<void> receive_message(ReadStream auto& stream)
 {
     char header[4];
     auto [ec, n] = co_await read(stream, mutable_buffer(header, 4));
-    if (ec.failed())
+    if (ec)
         co_return;
 
     // Parse length from header
@@ -191,7 +191,7 @@ task<void> receive_message(ReadStream auto& stream)
     // Read the body
     std::vector<char> body(len);
     auto [ec2, n2] = co_await read(stream, make_buffer(body));
-    if (ec2.failed())
+    if (ec2)
         co_return;
 
     process_message(body);
@@ -209,7 +209,7 @@ task<void> send_response(WriteStream auto& stream, std::string_view body)
         + std::to_string(body.size()) + "\r\n\r\n";
 
     auto [ec1, n1] = co_await write(stream, make_buffer(headers));
-    if (ec1.failed())
+    if (ec1)
         co_return;
 
     // Write body
@@ -232,11 +232,11 @@ task<void> echo_loop(ReadStream auto& in, WriteStream auto& out)
         auto [ec1, n] = co_await in.read_some(make_buffer(buffer));
         if (ec1 == cond::eof)
             break;
-        if (ec1.failed())
+        if (ec1)
             co_return;
 
         auto [ec2, written] = co_await write(out, make_buffer(buffer, n));
-        if (ec2.failed())
+        if (ec2)
             co_return;
     }
 }
diff --git a/doc/unlisted/coroutines-when-all.adoc b/doc/unlisted/coroutines-when-all.adoc
index dc7cd131c..e58644e55 100644
--- a/doc/unlisted/coroutines-when-all.adoc
+++ b/doc/unlisted/coroutines-when-all.adoc
@@ -19,11 +19,15 @@ Tasks are sequential by default. When you await multiple tasks:
 
 [source,cpp]
 ----
-task<void> sequential()
+io_task<int> fetch_a();
+io_task<int> fetch_b();
+io_task<int> fetch_c();
+
+task<> sequential()
 {
-    int a = co_await fetch_a();  // Wait for A
-    int b = co_await fetch_b();  // Then wait for B
-    int c = co_await fetch_c();  // Then wait for C
+    auto [ec_a, a] = co_await fetch_a();  // Wait for A
+    auto [ec_b, b] = co_await fetch_b();  // Then wait for B
+    auto [ec_c, c] = co_await fetch_c();  // Then wait for C
     // Total time: A + B + C
 }
 ----
@@ -40,9 +44,9 @@ all of them to complete:
 ----
 #include <boost/capy/when_all.hpp>
 
-task<void> concurrent()
+task<> concurrent()
 {
-    auto [a, b, c] = co_await when_all(
+    auto [ec, a, b, c] = co_await when_all(
         fetch_a(),
         fetch_b(),
         fetch_c()
@@ -52,55 +56,80 @@ task<void> concurrent()
 ----
 
 All three fetches run in parallel. The `co_await` completes when the slowest
-one finishes.
+one finishes. Children must return `io_result<...>` (i.e., be `io_task<T>`).
 
 == Return Value
 
-`when_all` returns a tuple of results, with void types filtered out:
+`when_all` returns `task<io_result<R1, R2, ..., Rn>>`, a single `ec` followed by each child's payload type. Each `Ri` is the child's payload flattened: `io_result<T>` contributes `T`, `io_result<>` contributes `tuple<>`.
 
 [source,cpp]
 ----
-// All non-void: get a tuple of all results
-auto [x, y] = co_await when_all(
-    task_returning_int(),     // task<int>
-    task_returning_string()   // task<std::string>
+// All non-void: single ec + flattened payloads
+auto [ec, x, y] = co_await when_all(
+    task_returning_int(),     // io_task<int>
+    task_returning_string()   // io_task<std::string>
 );
-// x is int, y is std::string
+// ec is std::error_code, x is int, y is std::string
 
-// Mixed with void: void tasks don't contribute
-auto [value] = co_await when_all(
-    task_returning_int(),  // task<int>
-    task_void(),           // task<void> - no contribution
-    task_void()            // task<void> - no contribution
+// Mixed with void: io_task<> contributes tuple<>
+auto [ec2, a, b, c] = co_await when_all(
+    task_returning_int(),  // io_task<int>    — a is int
+    task_void(),           // io_task<>       — b is tuple<>
+    task_void()            // io_task<>       — c is tuple<>
 );
-// value is int (only non-void result)
 
-// All void: returns void
-co_await when_all(
+// All void: just check ec
+auto r = co_await when_all(
     task_void(),
     task_void()
 );
-// No tuple, no return value
+if (r.ec)
+    // handle error
 ----
 
-Results appear in the same order as the input tasks.
+Results appear in the same order as the input tasks. Values are only meaningful when `!ec`.
 
 == Error Handling
 
-Exceptions propagate from child tasks to the parent. When a task throws:
+There are two error paths. I/O errors are reported through the `ec` field of the returned `io_result`. Thrown exceptions are captured separately.
+
+=== I/O Errors (ec)
+
+When a child returns a non-zero `ec`:
+
+1. Stop is requested for sibling tasks
+2. All tasks are allowed to complete (or respond to stop)
+3. The first `ec` is propagated in the outer `io_result`
+
+[source,cpp]
+----
+task<> handle_errors()
+{
+    auto [ec, a, b] = co_await when_all(
+        might_fail_io(),
+        another_io_task()
+    );
+    if (ec)
+        std::cerr << "I/O error: " << ec.message() << "\n";
+}
+----
+
+=== Exceptions
+
+When a child throws:
 
 1. The exception is captured
 2. Stop is requested for sibling tasks
 3. All tasks are allowed to complete (or respond to stop)
-4. The first exception is rethrown
+4. The first exception is rethrown (exception takes priority over `ec`)
 
 [source,cpp]
 ----
-task<void> handle_errors()
+task<> handle_exceptions()
 {
     try {
         co_await when_all(
-            might_fail(),
+            might_throw(),
             another_task(),
             third_task()
         );
@@ -113,8 +142,9 @@ task<void> handle_errors()
 
 === First-Error Semantics
 
-Only the first exception is captured; subsequent exceptions are discarded.
-This matches the behavior of most concurrent frameworks.
+Only the first error wins, whether `ec` or exception. Subsequent errors
+are discarded. If both an exception and an `ec` occur, the exception takes
+priority.
 
 === Stop Propagation
 
@@ -123,20 +153,21 @@ that support cancellation can respond by exiting early:
 
 [source,cpp]
 ----
-task<void> cancellable_work()
+io_task<> cancellable_work()
 {
     auto token = co_await this_coro::stop_token;
     for (int i = 0; i < 1000; ++i)
     {
         if (token.stop_requested())
-            co_return;  // Exit early
+            co_return io_result<>{};  // Exit early
         co_await do_chunk(i);
     }
+    co_return io_result<>{};
 }
 
-task<void> example()
+task<> example()
 {
-    // If failing_task throws, cancellable_work sees stop_requested
+    // If failing_task returns ec, cancellable_work sees stop_requested
     co_await when_all(
         failing_task(),
         cancellable_work()
@@ -151,7 +182,7 @@ cancelled, all children see the request:
 
 [source,cpp]
 ----
-task<void> parent()
+task<> parent()
 {
     // Parent has a stop token from run_async
     co_await when_all(
@@ -173,7 +204,7 @@ All child tasks inherit the parent's executor affinity:
 
 [source,cpp]
 ----
-task<void> parent()  // Running on executor ex
+task<> parent()  // Running on executor ex
 {
     co_await when_all(
         child_a(),  // Runs on ex
@@ -212,19 +243,26 @@ run_async(pool.get_executor())(parent());
 
 [source,cpp]
 ----
-task<std::string> fetch(http_client& client, std::string url)
+io_task<std::string> fetch(http_client& client, std::string url)
 {
-    co_return co_await client.get(url);
+    auto [ec, body] = co_await client.get(url);
+    co_return io_result<std::string>{ec, std::move(body)};
 }
 
-task<void> fetch_all(http_client& client)
+task<> fetch_all(http_client& client)
 {
-    auto [home, about, contact] = co_await when_all(
+    auto [ec, home, about, contact] = co_await when_all(
         fetch(client, "https://example.com/"),
         fetch(client, "https://example.com/about"),
         fetch(client, "https://example.com/contact")
     );
 
+    if (ec)
+    {
+        std::cerr << "Fetch failed: " << ec.message() << "\n";
+        co_return;
+    }
+
     std::cout << "Home: " << home.size() << " bytes\n";
     std::cout << "About: " << about.size() << " bytes\n";
     std::cout << "Contact: " << contact.size() << " bytes\n";
@@ -252,13 +290,13 @@ Do NOT use `when_all` when:
 | Feature | Description
 
 | `when_all(tasks...)`
-| Launch tasks concurrently, wait for all
+| Launch `io_task` children concurrently, wait for all
 
 | Return type
-| Tuple of non-void results in input order
+| `task<io_result<R1, ..., Rn>>` — single `ec` + flattened payloads
 
 | Error handling
-| First exception propagated, siblings get stop
+| First `ec` or exception propagated, siblings get stop
 
 | Affinity
 | Children inherit parent's executor
diff --git a/doc/unlisted/coroutines-when-any.adoc b/doc/unlisted/coroutines-when-any.adoc
index 45348b0a2..eb21ca7c8 100644
--- a/doc/unlisted/coroutines-when-any.adoc
+++ b/doc/unlisted/coroutines-when-any.adoc
@@ -9,7 +9,7 @@
 
 = Racing Tasks
 
-In this tutorial, you will learn how to race multiple concurrent tasks using `when_any`, returning as soon as the first task completes. This pattern is essential for implementing timeouts, redundant requests, and speculative execution.
+In this tutorial, you will learn how to race multiple concurrent tasks using `when_any`, returning as soon as the first task *succeeds* (`!ec`). This pattern is essential for implementing redundant requests and speculative execution.
 
 By the end of this page, you will understand how to use both the variadic and range-based overloads of `when_any`, handle the result types correctly, and manage cancellation of sibling tasks.
 
@@ -26,150 +26,198 @@ NOTE: Code snippets assume `using namespace boost::capy;` is in effect.
 Sometimes you need the result from whichever task finishes first, not all of them. Common scenarios include:
 
 * Racing requests to multiple servers, using the first response
-* Implementing timeouts by racing against a timer
 * Speculative execution of multiple algorithms
 * Waiting for first available resource from a pool
 
 == when_any
 
 The `when_any` function launches multiple tasks concurrently and returns when
-the first one completes:
+the first one *succeeds* (`!ec`):
 
 [source,cpp]
 ----
 #include <boost/capy/when_any.hpp>
 
-task<void> race()
+task<> race()
 {
-    auto [index, result] = co_await when_any(
-        fetch_from_primary(),
-        fetch_from_backup()
+    auto result = co_await when_any(
+        fetch_from_primary(),   // io_task<Response>
+        fetch_from_backup()     // io_task<Response>
     );
+    // result is variant<error_code, Response, Response>
+    // result.index() == 0: all tasks failed
+    // result.index() == 1: primary won
+    // result.index() == 2: backup won
 }
 ----
 
-The return value is a `std::pair` containing two elements: `index` indicates which task completed first (0 for the first argument, 1 for the second), and `result` holds the winning task's return value in a variant.
+The return value is a `std::variant` with `error_code` at index 0 (failure/no winner) followed by one alternative per input task. Index 1 corresponds to the first argument, index 2 to the second, and so on. A task *wins* only if it returns `!ec`. Errors and exceptions do not count as winning.
 
-The winning task's result is returned immediately. All sibling tasks receive
-a stop request and are allowed to complete before `when_any` returns.
+Once a winner is found, all sibling tasks receive a stop request. All tasks
+complete before `when_any` returns.
 
 == Return Value
 
-`when_any` returns a `std::pair` containing the winner's index and result.
+The variadic `when_any` returns `task<std::variant<std::error_code, R1, R2, ..., Rn>>`. Index 0 is the failure case (`error_code`). Indices 1..N correspond to the winning child's payload.
 
 === Heterogeneous Tasks (Variadic)
 
-When racing tasks with different return types, the result is a variant:
+When racing tasks with different return types:
 
 [source,cpp]
 ----
-auto [index, result] = co_await when_any(
-    task_returning_int(),     // task<int>
-    task_returning_string()   // task<std::string>
+auto result = co_await when_any(
+    task_returning_int(),     // io_task<int>
+    task_returning_string()   // io_task<std::string>
 );
 
-if (index == 0)
-    std::cout << "Got int: " << std::get<int>(result) << "\n";
+if (result.index() == 0)
+    std::cerr << "All failed: " << std::get<0>(result).message() << "\n";
+else if (result.index() == 1)
+    std::cout << "Got int: " << std::get<1>(result) << "\n";
 else
-    std::cout << "Got string: " << std::get<std::string>(result) << "\n";
+    std::cout << "Got string: " << std::get<2>(result) << "\n";
 ----
 
-The `result` variable is a `std::variant<int, std::string>`. Use `index` to determine which alternative is active, then extract the value with `std::get`.
+The `result` variable is a `std::variant<std::error_code, int, std::string>`. Index 0 is the failure case. Use `result.index() - 1` to identify which child won.
 
 === Void Tasks
 
-Void tasks contribute `std::monostate` to the variant:
+`io_task<>` children contribute `std::tuple<>` to the variant:
 
 [source,cpp]
 ----
-auto [index, result] = co_await when_any(
-    task_returning_int(),  // task<int>
-    task_void()            // task<void>
+auto result = co_await when_any(
+    task_returning_int(),  // io_task<int>
+    task_void()            // io_task<>
 );
 
-if (index == 0)
-    std::cout << "Got int: " << std::get<int>(result) << "\n";
+if (result.index() == 0)
+    std::cerr << "All failed\n";
+else if (result.index() == 1)
+    std::cout << "Got int: " << std::get<1>(result) << "\n";
 else
-    std::cout << "Void task completed\n";
+    std::cout << "Void task won\n";
 ----
 
-Tasks returning `void` contribute `std::monostate` to the variant. In this example, `result` has type `std::variant<int, std::monostate>`. When the void task wins, check for `std::monostate` or use the index to detect it.
+Here `result` has type `std::variant<std::error_code, int, std::tuple<>>`. Index 0 is failure, index 1 is the int task, index 2 is the void task.
 
-=== Duplicate Types
+=== Same-Type Tasks
 
-The variant is deduplicated. When racing tasks with the same return type,
-use the index to identify which task won:
+The variant preserves one alternative per task, plus `error_code` at index 0.
+Use `.index()` to identify which task won:
 
 [source,cpp]
 ----
-auto [index, result] = co_await when_any(
-    fetch_from_server_a(),  // task<Response>
-    fetch_from_server_b(),  // task<Response>
-    fetch_from_server_c()   // task<Response>
+auto result = co_await when_any(
+    fetch_from_server_a(),  // io_task<Response>
+    fetch_from_server_b(),  // io_task<Response>
+    fetch_from_server_c()   // io_task<Response>
 );
+// result: variant<error_code, Response, Response, Response>
 
-auto response = std::get<Response>(result);
-std::cout << "Server " << index << " responded first\n";
+if (result.index() == 0)
+{
+    std::cerr << "All servers failed\n";
+}
+else
+{
+    std::visit([](auto const& v) {
+        if constexpr (!std::is_same_v<std::decay_t<decltype(v)>, std::error_code>)
+            std::cout << "Winner: " << v << "\n";
+    }, result);
+    std::cout << "Server " << (result.index() - 1) << " responded first\n";
+}
 ----
 
-When multiple tasks share the same return type, the variant is deduplicated to contain only unique types. Here, `result` is `std::variant<Response>` with a single alternative. The `index` value (0, 1, or 2) tells you which server responded first.
-
 === Homogeneous Tasks (Vector)
 
-For a dynamic number of tasks with the same type, use the vector overload:
+For a dynamic number of tasks with the same type, use the range overload:
 
 [source,cpp]
 ----
-std::vector<task<Response>> requests;
+std::vector<io_task<Response>> requests;
 for (auto& server : servers)
     requests.push_back(fetch_from(server));
 
-auto [index, response] = co_await when_any(std::move(requests));
-std::cout << "Server " << index << " responded: " << response << "\n";
+auto result = co_await when_any(std::move(requests));
+// result: variant<error_code, pair<size_t, Response>>
+
+if (result.index() == 0)
+{
+    std::cerr << "All failed: " << std::get<0>(result).message() << "\n";
+}
+else
+{
+    auto& [index, response] = std::get<1>(result);
+    std::cout << "Server " << index << " responded: " << response << "\n";
+}
 ----
 
-The vector overload accepts any sized input range of awaitables with the same result type. Since all tasks return `Response`, the result is `std::pair<std::size_t, Response>` directly—no variant wrapper is needed.
+The range overload returns `variant<error_code, pair<size_t, Response>>`. Index 0 is the failure case. Index 1 holds the winner's index and payload.
 
-For void tasks in a vector, only the winner's index is returned:
+For void tasks in a range:
 
 [source,cpp]
 ----
-std::vector<task<void>> tasks;
+std::vector<io_task<>> tasks;
 // ... populate tasks
 
-std::size_t winner = co_await when_any(std::move(tasks));
-std::cout << "Task " << winner << " completed first\n";
+auto result = co_await when_any(std::move(tasks));
+// result: variant<error_code, size_t>
+
+if (result.index() == 1)
+    std::cout << "Task " << std::get<1>(result) << " completed first\n";
 ----
 
-Since void tasks produce no result value, the return type is `std::size_t` rather than a pair.
+Since void tasks produce no payload, the success alternative is just `std::size_t` (the winner's index).
 
 == Error Handling
 
-Exceptions are treated as valid completions. If the winning task throws,
-that exception is rethrown from `when_any`:
+Only a successful task (`!ec`) can win. Tasks that return an error code or throw an exception do *not* win. `when_any` keeps waiting for a success.
+
+=== All Tasks Fail
+
+If every task returns `ec`, the variant holds `error_code` at index 0:
+
+[source,cpp]
+----
+task<> handle_all_fail()
+{
+    auto result = co_await when_any(
+        might_fail_a(),
+        might_fail_b()
+    );
+    if (result.index() == 0)
+        std::cerr << "No winner: " << std::get<0>(result).message() << "\n";
+}
+----
+
+=== All Tasks Throw
+
+If every task throws, the first exception is rethrown:
 
 [source,cpp]
 ----
-task<void> handle_errors()
+task<> handle_all_throw()
 {
     try {
-        auto [index, result] = co_await when_any(
-            might_fail(),
-            might_succeed()
+        co_await when_any(
+            throws_a(),
+            throws_b()
         );
-        // If we get here, the winner succeeded
     } catch (std::exception const& e) {
-        // The winning task threw this exception
-        std::cerr << "Winner failed: " << e.what() << "\n";
+        // All threw — first exception is rethrown
+        std::cerr << "Error: " << e.what() << "\n";
     }
 }
 ----
 
-=== First-Completion Semantics
+=== Success-Only Winner Semantics
 
-Unlike `when_all` (which captures the first _error_), `when_any` returns
-whichever task completes first, whether it succeeds or fails. Exceptions
-from non-winning tasks are discarded.
+Unlike `when_all` (which propagates the first error), `when_any` treats errors
+as non-winning. A failed task does not disqualify pending siblings; they keep
+running until one succeeds or all complete.
 
 === Stop Propagation
 
@@ -178,23 +226,23 @@ Tasks that support cancellation can exit early:
 
 [source,cpp]
 ----
-task<Response> fetch_with_cancel_support()
+io_task<Response> fetch_with_cancel_support()
 {
-    auto token = co_await get_stop_token();
+    auto token = co_await this_coro::stop_token;
 
     for (auto& chunk : data_source)
     {
         if (token.stop_requested())
-            co_return partial_response();  // Exit early
+            co_return io_result<Response>{{}, partial_response()};
         co_await send_chunk(chunk);
     }
-    co_return complete_response();
+    co_return io_result<Response>{{}, complete_response()};
 }
 
-task<void> example()
+task<> example()
 {
     // When one fetch wins, the other sees stop_requested
-    auto [index, response] = co_await when_any(
+    auto result = co_await when_any(
         fetch_with_cancel_support(),
         fetch_with_cancel_support()
     );
@@ -211,9 +259,9 @@ cancelled, all children see the request:
 
 [source,cpp]
 ----
-task<void> parent()
+task<> parent()
 {
-    auto [index, result] = co_await when_any(
+    auto result = co_await when_any(
         child_a(),  // Sees parent's stop token
         child_b()   // Sees parent's stop token
     );
@@ -222,7 +270,7 @@ task<void> parent()
 std::stop_source source;
 run_async(ex, source.get_token())(parent());
 
-// Later: cancel everything
+// Later: cancel everything — variant holds error_code at index 0
 source.request_stop();
 ----
 
@@ -232,9 +280,9 @@ All child tasks inherit the parent's executor affinity:
 
 [source,cpp]
 ----
-task<void> parent()  // Running on executor ex
+task<> parent()  // Running on executor ex
 {
-    auto [index, result] = co_await when_any(
+    auto result = co_await when_any(
         child_a(),  // Runs on ex
         child_b()   // Runs on ex
     );
@@ -265,69 +313,55 @@ Race requests to multiple servers for reliability:
 
 [source,cpp]
 ----
-task<Response> fetch_with_redundancy(Request req)
+io_task<Response> fetch_with_redundancy(Request req)
 {
-    auto [index, response] = co_await when_any(
+    auto result = co_await when_any(
         fetch_from(primary_server, req),
         fetch_from(backup_server, req)
     );
+    // result: variant<error_code, Response, Response>
 
-    std::cout << (index == 0 ? "Primary" : "Backup")
-              << " server responded\n";
-    co_return std::get<Response>(response);
-}
-----
+    if (result.index() == 0)
+        co_return io_result<Response>{std::get<0>(result), {}};
 
-== Example: Timeout Pattern
-
-Race an operation against a timer:
-
-[source,cpp]
-----
-task<Data> fetch_with_timeout(Request req)
-{
-    auto [index, result] = co_await when_any(
-        fetch_data(req),
-        timeout_after<Data>(100ms)
-    );
-
-    if (index == 1)
-        throw timeout_error{"Request timed out"};
-
-    co_return std::get<Data>(result);
-}
+    std::cout << (result.index() == 1 ? "Primary" : "Backup")
+              << " server responded\n";
 
-// Helper that waits then throws
-template<typename T>
-task<T> timeout_after(std::chrono::milliseconds ms)
-{
-    co_await sleep(ms);
-    throw timeout_error{"Timeout"};
+    // Extract the winner's Response
+    Response resp;
+    std::visit([&](auto const& v) {
+        if constexpr (!std::is_same_v<std::decay_t<decltype(v)>, std::error_code>)
+            resp = v;
+    }, result);
+    co_return io_result<Response>{{}, std::move(resp)};
 }
 ----
 
-The `timeout_after` helper waits for the specified duration then throws an exception. If `fetch_data` completes before the timer, its result is returned. If the timer wins, the timeout exception propagates from `when_any`.
-
 == Example: First Available Resource
 
 Wait for the first available connection from a pool:
 
 [source,cpp]
 ----
-task<Connection> get_connection(std::vector<ConnectionPool>& pools)
+io_task<Connection> get_connection(std::vector<ConnectionPool>& pools)
 {
-    std::vector<task<Connection>> attempts;
+    std::vector<io_task<Connection>> attempts;
     for (auto& pool : pools)
         attempts.push_back(pool.acquire());
 
-    auto [index, conn] = co_await when_any(std::move(attempts));
+    auto result = co_await when_any(std::move(attempts));
+    // result: variant<error_code, pair<size_t, Connection>>
+
+    if (result.index() == 0)
+        co_return io_result<Connection>{std::get<0>(result), {}};
 
+    auto& [index, conn] = std::get<1>(result);
     std::cout << "Got connection from pool " << index << "\n";
-    co_return conn;
+    co_return io_result<Connection>{{}, std::move(conn)};
 }
 ----
 
-This function creates an acquire task for each pool, then races them. Whichever pool provides a connection first wins, and the remaining acquire attempts are cancelled. The `index` indicates which pool provided the connection.
+This function creates an acquire task for each pool, then races them. Whichever pool provides a connection first (with `!ec`) wins, and the remaining acquire attempts are cancelled. The `index` indicates which pool provided the connection.
 
 == Comparison with when_all
 
@@ -337,19 +371,19 @@ This function creates an acquire task for each pool, then races them. Whichever
 
 | Completion
 | Waits for all tasks
-| Returns on first completion
+| Returns on first success (`!ec`)
 
 | Return type
-| Tuple of results
-| Pair of (index, variant/value)
+| `task<io_result<R1, ..., Rn>>`
+| `task<variant<error_code, R1, ..., Rn>>`
 
 | Error handling
-| First exception wins, siblings get stop
-| Exceptions are valid completions
+| First `ec` or exception wins, siblings get stop
+| Only `!ec` wins; errors do not win
 
 | Use case
 | Need all results
-| Need fastest result
+| Need fastest successful result
 |===
 
 == Summary
@@ -359,22 +393,25 @@ This function creates an acquire task for each pool, then races them. Whichever
 | Feature | Description
 
 | `when_any(tasks...)`
-| Race tasks, return first completion
+| Race `io_task` children, return first success
 
-| `when_any(vector<task<T>>)`
-| Race homogeneous tasks from a vector
+| `when_any(range)`
+| Race homogeneous `io_task` children from a range
 
 | Return type (variadic)
-| `pair<size_t, variant<...>>` with deduplicated types
+| `variant<error_code, R1, ..., Rn>` — index 0 is failure, 1..N is winner
 
-| Return type (vector)
-| `pair<size_t, T>` or `size_t` for void
+| Return type (range, non-void)
+| `variant<error_code, pair<size_t, T>>`
 
-| Error handling
-| Winner's exception propagated, others discarded
+| Return type (range, void)
+| `variant<error_code, size_t>`
+
+| Winner selection
+| Only `!ec` wins; errors and exceptions do not count
 
 | Stop propagation
-| Siblings receive stop request on winner
+| Siblings receive stop request when winner found
 
 | Cleanup
 | All tasks complete before returning
diff --git a/doc/unlisted/execution-executors.adoc b/doc/unlisted/execution-executors.adoc
index 81ee0dc53..315ab1133 100644
--- a/doc/unlisted/execution-executors.adoc
+++ b/doc/unlisted/execution-executors.adoc
@@ -40,15 +40,15 @@ A dispatcher is a callable that schedules coroutine resumption:
 
 [source,cpp]
 ----
-template<typename D, typename P = void>
-concept dispatcher = requires(D const& d, std::coroutine_handle<P> h) {
-    { d(h) } -> std::convertible_to<std::coroutine_handle<>>;
+template<typename D>
+concept dispatcher = requires(D const& d, continuation& c) {
+    { d(c) } -> std::convertible_to<std::coroutine_handle<>>;
 };
 ----
 
-When invoked with a coroutine handle, the dispatcher:
+When invoked with a `continuation&`, the dispatcher:
 
-1. Schedules the handle for resumption (inline or queued)
+1. Schedules the continuation for resumption (inline or queued)
 2. Returns a handle suitable for symmetric transfer
 
 === Example Dispatcher
@@ -57,9 +57,9 @@ When invoked with a coroutine handle, the dispatcher:
 ----
 struct inline_dispatcher
 {
-    std::coroutine_handle<> operator()(std::coroutine_handle<> h) const
+    std::coroutine_handle<> operator()(continuation& c) const
     {
-        return h;  // Resume inline via symmetric transfer
+        return c.h;  // Resume inline via symmetric transfer
     }
 };
 
@@ -67,9 +67,9 @@ struct queuing_dispatcher
 {
     work_queue* queue_;
 
-    std::coroutine_handle<> operator()(std::coroutine_handle<> h) const
+    std::coroutine_handle<> operator()(continuation& c) const
     {
-        queue_->push(h);
+        queue_->push(c.h);
         return std::noop_coroutine();  // Caller returns to event loop
     }
 };
@@ -85,13 +85,12 @@ template<class E>
 concept executor =
     std::copy_constructible<E> &&
     std::equality_comparable<E> &&
-    requires(E const& ce, std::coroutine_handle<> h) {
+    requires(E const& ce, continuation& c) {
         { ce.context() } -> /* reference to execution context */;
         { ce.on_work_started() } noexcept;
         { ce.on_work_finished() } noexcept;
-        { ce(h) } -> std::convertible_to<std::coroutine_handle<>>;
-        { ce.post(h) };
-        { ce.defer(h) };
+        { ce.dispatch(c) } -> std::same_as<std::coroutine_handle<>>;
+        { ce.post(c) };
     };
 ----
 
@@ -101,21 +100,17 @@ concept executor =
 |===
 | Operation | Behavior
 
-| `operator()(h)`
-| Run inline if safe, otherwise queue. Cheapest path. Also serves as dispatcher interface.
+| `dispatch(c)`
+| Run inline if safe, otherwise queue. Returns `std::coroutine_handle<>` for symmetric transfer.
 
-| `post(h)`
+| `post(c)`
 | Always queue, never inline. Guaranteed asynchrony.
-
-| `defer(h)`
-| Always queue with "this is my continuation" hint. Enables optimizations.
 |===
 
 **When to use each:**
 
-* `operator()` — Default choice. Allows the executor to optimize.
+* `dispatch` — Default choice. Allows the executor to optimize with inline resumption.
 * `post` — When you need guaranteed asynchrony (e.g., releasing a lock first).
-* `defer` — When posting your own continuation (enables thread-local queuing).
 
 === Work Tracking
 
@@ -141,7 +136,7 @@ Awaitables that participate in affinity propagation implement `affine_awaitable`
 ----
 template<typename A, typename D, typename P = void>
 concept affine_awaitable =
-    dispatcher<D, P> &&
+    dispatcher<D> &&
     requires(A a, std::coroutine_handle<P> h, D const& d) {
         a.await_suspend(h, d);
     };
@@ -158,7 +153,7 @@ Awaitables with cancellation support implement `stoppable_awaitable`:
 ----
 template<typename A, typename D, typename P = void>
 concept stoppable_awaitable =
-    affine_awaitable<A, D, P> &&
+    affine_awaitable<A, D> &&
     requires(A a, std::coroutine_handle<P> h, D const& d, std::stop_token t) {
         a.await_suspend(h, d, t);
     };
@@ -171,7 +166,7 @@ Stoppable awaitables provide _both_ overloads of `await_suspend`.
 Executors have specific thread safety guarantees:
 
 * Copy constructor, comparison, `context()` — always thread-safe
-* `operator()`, `post`, `defer` — thread-safe for concurrent calls
+* `dispatch`, `post` — thread-safe for concurrent calls
 * `on_work_started`, `on_work_finished` — thread-safe, must not throw
 
 == Executor Validity
@@ -184,7 +179,7 @@ thread_pool pool;
 auto ex = pool.get_executor();
 // When pool is destroyed, ex becomes invalid
 
-// WARNING: Calling ex() after pool destruction is undefined behavior
+// WARNING: Calling ex.dispatch() after pool destruction is undefined behavior
 ----
 
 The copy constructor and `context()` remain valid until the context is
diff --git a/doc/unlisted/execution-strand.adoc b/doc/unlisted/execution-strand.adoc
index 74f081445..453da73c3 100644
--- a/doc/unlisted/execution-strand.adoc
+++ b/doc/unlisted/execution-strand.adoc
@@ -220,15 +220,14 @@ void callback()
 
 == Implementation Notes
 
-Capy's strand uses a fixed pool of 211 implementation objects. New strands
-hash to select an impl from the pool. Strands that hash to the same index
-share serialization:
-
-* This is harmless — just extra serialization
-* Rare with 211 buckets
-* No allocation for strand creation
-
-This design trades minimal extra serialization for zero per-strand allocation.
+Each strand owns a private serialization state. Strands sharing an
+execution context draw from a small pool of mutexes (193 entries) for
+their internal critical sections; mutex sharing causes only brief
+contention on push/pop, never cross-strand state sharing.
+
+* Construction cost: one `std::make_shared` per strand
+* Two distinct strands always serialize independently
+* The mutex pool keeps memory bounded as strand count grows
 
 == When NOT to Use Strands
 
diff --git a/doc/unlisted/execution-thread-pool.adoc b/doc/unlisted/execution-thread-pool.adoc
index 89d6bd603..71b1e4b34 100644
--- a/doc/unlisted/execution-thread-pool.adoc
+++ b/doc/unlisted/execution-thread-pool.adoc
@@ -92,28 +92,68 @@ run_async(pool.get_executor())(compute(), [](int result) {
 
 == Lifetime and Shutdown
 
-The pool destructor waits for all work to complete:
+=== Waiting for Work: join()
+
+Call `join()` to block until all outstanding work completes:
+
+[source,cpp]
+----
+thread_pool pool(4);
+
+for (auto& item : batch)
+    run_async(pool.get_executor())(process(item));
+
+pool.join();  // Blocks until all tasks finish
+// Pool is now stopped; worker threads are joined
+----
+
+`join()` releases the pool's internal work guard and blocks until the
+outstanding work count (tracked by `on_work_started()` / `on_work_finished()`)
+reaches zero. After all work completes, the worker threads are joined.
+
+The pool cannot be reused after `join()`. Calling `join()` more than once
+is safe (subsequent calls are no-ops).
+
+=== Immediate Stop: stop()
+
+Call `stop()` to abandon remaining work:
+
+[source,cpp]
+----
+pool.stop();   // Workers exit after current item; queued work is abandoned
+pool.join();   // Wait for threads to finish
+----
+
+If `join()` is blocking on another thread, calling `stop()` causes it to
+stop waiting for outstanding work. The `join()` call still waits for worker
+threads to finish their current item and exit before returning.
+
+=== Destructor Behavior
+
+The destructor calls `stop()` then `join()`:
 
 [source,cpp]
 ----
 {
     thread_pool pool(4);
     run_async(pool.get_executor())(long_running_task());
-    // Destructor blocks until long_running_task completes
+    // Destructor: stop() -> join() -> shutdown services -> destroy services
+    // Queued work that hasn't started is abandoned
 }
 ----
 
-This ensures orderly shutdown without orphaned coroutines.
+To wait for all work to complete before shutdown, call `join()` explicitly
+before the pool goes out of scope.
 
 === Destruction Order
 
 When a pool is destroyed:
 
-1. Threads are signaled to stop accepting new work
-2. Pending work continues to completion
+1. Workers are signaled to stop (pending work is abandoned)
+2. Worker threads are joined
 3. Services are shut down (in reverse order of creation)
 4. Services are destroyed
-5. Threads are joined
+5. Remaining queued work items are destroyed
 
 == Executor Operations
 
@@ -161,7 +201,9 @@ Since callers are never "inside" the thread pool's execution context,
 
 == Work Tracking
 
-Work tracking keeps the pool alive while operations are outstanding:
+Work tracking keeps the pool alive while operations are outstanding.
+When `join()` has been called, the pool waits until the outstanding work
+count reaches zero before stopping the worker threads.
 
 [source,cpp]
 ----
@@ -179,13 +221,14 @@ The `work_guard` RAII wrapper simplifies this:
 {
     work_guard guard(ex);
     // Work count incremented
-    
+
     // ... do work ...
-    
+
 }  // Work count decremented
 ----
 
-`run_async` handles work tracking automatically.
+`run_async` handles work tracking automatically — each launched task
+holds a `work_guard` for its lifetime.
 
 == Services
 
@@ -323,6 +366,12 @@ void process_batch()
 | `get_executor()`
 | Get an executor for the pool
 
+| `join()`
+| Wait for all outstanding work to complete
+
+| `stop()`
+| Immediately stop the pool, abandoning queued work
+
 | Services
 | Polymorphic components owned by the pool
 |===
diff --git a/doc/unlisted/io-awaitables-executor.adoc b/doc/unlisted/io-awaitables-executor.adoc
index 7394163ea..d5644a2cd 100644
--- a/doc/unlisted/io-awaitables-executor.adoc
+++ b/doc/unlisted/io-awaitables-executor.adoc
@@ -20,9 +20,9 @@ the executor determines _where_ and _when_ the coroutine resumes:
 [source,cpp]
 ----
 // Completion arrives on I/O thread
-void on_io_complete(std::coroutine_handle<> h)
+void on_io_complete(continuation& c)
 {
-    executor.dispatch(h).resume();  // Resume on executor's context
+    executor.dispatch(c).resume();  // Resume on executor's context
 }
 ----
 
@@ -36,10 +36,10 @@ Capy's `Executor` concept requires two operations:
 [source,cpp]
 ----
 template<typename Ex>
-concept Executor = requires(Ex const& ex, std::coroutine_handle<> h)
+concept Executor = requires(Ex const& ex, continuation& c)
 {
-    { ex.dispatch(h) } -> std::same_as<std::coroutine_handle<>>;
-    { ex.post(h) } -> std::same_as<void>;
+    { ex.dispatch(c) } -> std::same_as<std::coroutine_handle<>>;
+    { ex.post(c) } -> std::same_as<void>;
 };
 ----
 
@@ -52,11 +52,11 @@ Returns a handle to resume. The implementation decides whether to:
 
 [source,cpp]
 ----
-std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+std::coroutine_handle<> dispatch(continuation& c) const
 {
     if (running_in_this_context())
-        return h;  // Resume inline
-    queue_work(h);
+        return c.h;  // Resume inline
+    queue_work(c.h);
     return std::noop_coroutine();  // Don't resume now
 }
 ----
@@ -69,9 +69,9 @@ Always queues, never executes inline:
 
 [source,cpp]
 ----
-void post(std::coroutine_handle<> h) const
+void post(continuation& c) const
 {
-    queue_work(h);  // Always queue, never inline
+    queue_work(c.h);  // Always queue, never inline
 }
 ----
 
@@ -92,8 +92,8 @@ thread_pool pool(4);
 executor_ref ex = pool.get_executor();
 
 // Use uniformly
-ex.dispatch(handle);
-ex.post(handle);
+ex.dispatch(cont);
+ex.post(cont);
 ----
 
 === Why Type Erasure?
@@ -128,7 +128,7 @@ executor_ref ex = some_executor;
 
 // Check if valid
 if (ex)
-    ex.dispatch(h);
+    ex.dispatch(c);
 
 // Compare (pointer equality on underlying executor)
 executor_ref ex2 = some_executor;
@@ -288,7 +288,7 @@ task<void> example()
     }
 
     // Use for manual dispatch
-    ex.post(some_handle);
+    ex.post(some_continuation);
 }
 ----
 
@@ -307,11 +307,11 @@ actually suspends—`await_ready()` returns `true`.
 | `executor_ref`
 | Type-erased executor wrapper
 
-| `dispatch(h)`
-| Resume inline if safe, queue otherwise
+| `dispatch(c)`
+| Resume inline if safe, queue otherwise. Takes `continuation&`, returns `std::coroutine_handle<>`.
 
-| `post(h)`
-| Always queue, never inline
+| `post(c)`
+| Always queue, never inline. Takes `continuation&`.
 
 | `run(ex)(task)`
 | Override inherited executor for a subtask
diff --git a/doc/unlisted/library-buffers.adoc b/doc/unlisted/library-buffers.adoc
index f2d190030..de00e9366 100644
--- a/doc/unlisted/library-buffers.adoc
+++ b/doc/unlisted/library-buffers.adoc
@@ -35,14 +35,14 @@ task<void> echo(ReadStream auto& in, WriteStream auto& out)
     for (;;)
     {
         auto [ec1, n] = co_await in.read_some(make_buffer(buffer));
-        if (ec1 == cond::eof)
-            break;
-        if (ec1.failed())
-            co_return;
 
         auto [ec2, _] = co_await write(out, make_buffer(buffer, n));
-        if (ec2.failed())
-            co_return;
+
+        if (ec1)
+            break;
+
+        if (ec2)
+            break;
     }
 }
 ----
@@ -128,8 +128,8 @@ task<void> echo(ReadStream auto& in, WriteStream auto& out)
 | `<boost/capy/buffers/buffer_copy.hpp>`
 | `buffer_copy` algorithm
 
-| `<boost/capy/buffers/consuming_buffers.hpp>`
-| Incremental buffer consumption
+| `<boost/capy/buffers/buffer_slice.hpp>`
+| Byte sub-range slicing algorithm
 
 | `<boost/capy/buffers/flat_dynamic_buffer.hpp>`
 | Contiguous dynamic buffer
diff --git a/doc/unlisted/library-io-result.adoc b/doc/unlisted/library-io-result.adoc
index 3c808e7b4..5503c1265 100644
--- a/doc/unlisted/library-io-result.adoc
+++ b/doc/unlisted/library-io-result.adoc
@@ -52,7 +52,7 @@ Explicit, but:
 
 // Returns error code + bytes transferred
 auto [ec, n] = co_await stream.read_some(buffer);
-if (ec.failed())
+if (ec)
     handle_error(ec);
 process_data(buffer, n);
 ----
@@ -75,7 +75,7 @@ Operations that succeed or fail without a result value:
 [source,cpp]
 ----
 auto [ec] = co_await socket.connect(endpoint);
-if (ec.failed())
+if (ec)
     co_return handle_connection_error(ec);
 ----
 
@@ -86,7 +86,7 @@ Operations returning a single value, like bytes transferred:
 [source,cpp]
 ----
 auto [ec, n] = co_await stream.read_some(buffer);
-if (ec.failed())
+if (ec)
     co_return;
 buffer.commit(n);  // n bytes were read
 ----
@@ -138,7 +138,7 @@ io_result<>                      → [ec]
 [source,cpp]
 ----
 auto [ec, n] = co_await stream.read_some(buffer);
-if (ec.failed())
+if (ec)
 {
     log_error(ec);
     co_return;  // Or throw, or return error
@@ -151,7 +151,7 @@ if (ec.failed())
 [source,cpp]
 ----
 auto [ec, n] = co_await stream.read_some(buffer);
-if (ec.failed() && ec != error::operation_aborted)
+if (ec && ec != error::operation_aborted)
 {
     // Handle unexpected errors
     throw system_error(ec);
@@ -164,7 +164,7 @@ if (ec.failed() && ec != error::operation_aborted)
 [source,cpp]
 ----
 auto [ec, n] = co_await stream.read_some(buffer);
-if (ec.failed())
+if (ec)
     throw system_error(ec);
 ----
 
@@ -180,7 +180,7 @@ io_task<std::size_t> read_all(stream& s, buffer& buf)
     while (buf.size() < buf.max_size())
     {
         auto [ec, n] = co_await s.read_some(buf.prepare(1024));
-        if (ec.failed())
+        if (ec)
             co_return {ec, total};  // Return partial progress
         buf.commit(n);
         total += n;
@@ -213,7 +213,7 @@ common) and exceptions for programmer errors (bugs, invariant violations).
 [source,cpp]
 ----
 auto [ec, n] = co_await stream.read_some(buffer);
-if (ec.failed())
+if (ec)
     throw system_error(ec, "read failed");
 // Continue...
 ----
@@ -226,7 +226,7 @@ template<typename... Args>
 auto throw_on_error(io_result<Args...> result)
 {
     auto& [ec, rest...] = result;
-    if (ec.failed())
+    if (ec)
         throw system_error(ec);
     return std::tie(rest...);
 }
@@ -248,7 +248,7 @@ task<void> read_all(stream& s, dynamic_buffer& buf)
         auto [ec, n] = co_await s.read_some(buf.prepare(1024));
         if (ec == error::end_of_stream)
             co_return;  // Normal completion
-        if (ec.failed())
+        if (ec)
             throw system_error(ec);
         buf.commit(n);
     }
@@ -264,7 +264,7 @@ io_task<> write_with_retry(stream& s, const_buffer data, int retries)
     for (int i = 0; i < retries; ++i)
     {
         auto [ec, n] = co_await s.write_some(data);
-        if (!ec.failed())
+        if (!ec)
         {
             data += n;
             if (data.size() == 0)
@@ -300,7 +300,7 @@ io_task<> write_with_retry(stream& s, const_buffer data, int retries)
 | `auto [ec, n] = ...`
 
 | Error check
-| `ec.failed()` (not `ec` or `!!ec`)
+| `if(ec)` or `if(!ec)`
 |===
 
 == Next Steps
diff --git a/doc/unlisted/library-streams.adoc b/doc/unlisted/library-streams.adoc
index 66460ca43..9718377cf 100644
--- a/doc/unlisted/library-streams.adoc
+++ b/doc/unlisted/library-streams.adoc
@@ -107,7 +107,7 @@ concept ReadStream = requires(T& stream, MutableBufferSequence auto buffers)
 
 ==== Behavior
 
-`read_some()` reads **at least one byte** (unless EOF or error) but may
+`read_some()` attempts to read up to `buffer_size(buffers)` bytes but may
 read fewer bytes than the buffer can hold:
 
 [source,cpp]
@@ -121,9 +121,12 @@ This matches OS behavior: `recv()` returns whatever data is currently available.
 
 ==== Return Values
 
-* **Success**: `!ec.failed()` is true, `n >= 1`
-* **EOF**: `ec == cond::eof`, `n == 0`
-* **Error**: `ec.failed()` is true, `n == 0`
+If `buffer_size(buffers) > 0`:
+
+* If `!ec`: `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were read.
+* If `ec`: `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes read before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
 
 ==== Example
 
@@ -133,7 +136,7 @@ task<std::string> read_available(ReadStream auto& stream)
 {
     char buffer[4096];
     auto [ec, n] = co_await stream.read_some(make_buffer(buffer));
-    if (ec.failed())
+    if (ec)
         co_return {};
     co_return std::string(buffer, n);
 }
@@ -157,7 +160,7 @@ concept WriteStream = requires(T& stream, ConstBufferSequence auto buffers)
 
 ==== Behavior
 
-`write_some()` writes **at least one byte** (unless error) but may write
+`write_some()` attempts to write up to `buffer_size(buffers)` bytes but may write
 fewer bytes than provided:
 
 [source,cpp]
@@ -172,8 +175,12 @@ kernel buffers are constrained.
 
 ==== Return Values
 
-* **Success**: `!ec.failed()` is true, `n >= 1`
-* **Error**: `ec.failed()` is true, `n == 0`
+If `buffer_size(buffers) > 0`:
+
+* If `!ec`: `n >= 1 && n \<= buffer_size(buffers)`. `n` bytes were written.
+* If `ec`: `n >= 0 && n \<= buffer_size(buffers)`. `n` is the number of bytes written before the I/O condition arose.
+
+If `buffer_empty(buffers)` is true, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
 
 ==== Example
 
@@ -182,7 +189,7 @@ kernel buffers are constrained.
 task<> write_chunk(WriteStream auto& stream, std::string_view data)
 {
     auto [ec, n] = co_await stream.write_some(make_buffer(data));
-    if (ec.failed())
+    if (ec)
         co_return;
     // Only n bytes were written; caller must handle remainder
 }
@@ -222,9 +229,9 @@ auto [ec, n] = co_await source.read(mutable_buffer(header, 16));
 
 ==== Return Values
 
-* **Success**: `!ec.failed()` is true, `n == buffer_size(buffers)`
+* **Success**: `!ec`, `n == buffer_size(buffers)`
 * **EOF**: `ec == cond::eof`, `n` is bytes read before EOF
-* **Error**: `ec.failed()` is true, `n` is bytes read before error
+* **Error**: `ec`, `n` is bytes read before error
 
 ==== Example
 
@@ -234,7 +241,7 @@ task<std::vector<char>> read_exact(ReadSource auto& source, std::size_t count)
 {
     std::vector<char> result(count);
     auto [ec, n] = co_await source.read(make_buffer(result));
-    if (ec.failed())
+    if (ec)
         co_return {};
     co_return result;
 }
@@ -285,8 +292,8 @@ are permitted.
 
 ==== Return Values
 
-* **Success**: `!ec.failed()` is true, `n == buffer_size(buffers)`
-* **Error**: `ec.failed()` is true, `n` is bytes written before error
+* **Success**: `!ec`, `n == buffer_size(buffers)`
+* **Error**: `ec`, `n` is bytes written before error
 
 Reference: `<boost/capy/concept/write_sink.hpp>`
 
@@ -324,9 +331,9 @@ auto [ec, count] = co_await source.pull(arr, 16);
 
 ==== Return Values
 
-* **Data available**: `!ec.failed()` is true, `count > 0`
-* **Source exhausted**: `!ec.failed()` is true, `count == 0`
-* **Error**: `ec.failed()` is true
+* **Data available**: `!ec`, `count > 0`
+* **Source exhausted**: `!ec`, `count == 0`
+* **Error**: `ec`
 
 ==== Buffer Lifetime
 
@@ -344,7 +351,7 @@ task<> consume_all(BufferSource auto& source)
     for (;;)
     {
         auto [ec, count] = co_await source.pull(arr, 16);
-        if (ec.failed())
+        if (ec)
             co_return;
         if (count == 0)
             break;  // Source exhausted
@@ -418,7 +425,7 @@ task<> produce_data(BufferSink auto& sink, std::string_view data)
         {
             // No space; flush and retry
             auto [ec] = co_await sink.commit(0);
-            if (ec.failed())
+            if (ec)
                 co_return;
             continue;
         }
@@ -434,7 +441,7 @@ task<> produce_data(BufferSink auto& sink, std::string_view data)
         }
 
         auto [ec] = co_await sink.commit(written);
-        if (ec.failed())
+        if (ec)
             co_return;
     }
 
@@ -633,11 +640,11 @@ task<> echo(any_stream& stream)
         auto [ec, n] = co_await stream.read_some(make_buffer(buffer));
         if (ec == cond::eof)
             break;
-        if (ec.failed())
+        if (ec)
             co_return;
 
         auto [wec, wn] = co_await write(stream, make_buffer(buffer, n));
-        if (wec.failed())
+        if (wec)
             co_return;
     }
 }
@@ -706,12 +713,12 @@ task<> echo_server(any_stream& stream)
         auto [ec, n] = co_await stream.read_some(make_buffer(buffer));
         if (ec == cond::eof)
             break;
-        if (ec.failed())
+        if (ec)
             co_return;
 
         // Complete write: send all bytes
         auto [wec, _] = co_await write(stream, make_buffer(buffer, n));
-        if (wec.failed())
+        if (wec)
             co_return;
     }
 }
@@ -725,7 +732,7 @@ task<> send_response_body(any_write_sink& body, std::string_view content)
 {
     // Write all content and signal EOF
     auto [ec, n] = co_await body.write(make_buffer(content), true);
-    if (ec.failed())
+    if (ec)
     {
         // Handle error
     }
diff --git a/doc/unlisted/library-testing.adoc b/doc/unlisted/library-testing.adoc
index 61b26ba65..c22919377 100644
--- a/doc/unlisted/library-testing.adoc
+++ b/doc/unlisted/library-testing.adoc
@@ -93,11 +93,11 @@ Systematic error injection that tests every failure path:
 fuse f;
 auto r = f.armed([](fuse& f) {
     auto ec = f.maybe_fail();
-    if (ec.failed())
+    if (ec)
         return;  // Exit on injected error
 
     ec = f.maybe_fail();
-    if (ec.failed())
+    if (ec)
         return;
 
     // Success path
@@ -126,12 +126,12 @@ Tests must handle injected errors by returning early:
 ----
 // CORRECT: early return on injected error
 auto [ec, n] = co_await rs.read_some(buf);
-if (ec.failed())
+if (ec)
     co_return;  // fuse injected error, exit gracefully
 
 // WRONG: asserting success unconditionally
 auto [ec, n] = co_await rs.read_some(buf);
-BOOST_TEST(!ec.failed());  // FAILS when fuse injects error!
+BOOST_TEST(!ec);  // FAILS when fuse injects error!
 ----
 
 === Coroutine Support
@@ -147,7 +147,7 @@ rs.provide("test data");
 auto r = f.armed([&](fuse&) -> task<> {
     char buf[32];
     auto [ec, n] = co_await rs.read_some(make_buffer(buf));
-    if (ec.failed())
+    if (ec)
         co_return;
     // Process data...
 });
@@ -184,7 +184,7 @@ public:
     system::error_code do_work()
     {
         auto ec = f_.maybe_fail();  // No-op in production
-        if (ec.failed())
+        if (ec)
             return ec;
         // ... actual work ...
         return {};
@@ -331,7 +331,7 @@ rs.provide("World!");
 auto r = f.armed([&](fuse&) -> task<> {
     char buf[32];
     auto [ec, n] = co_await rs.read_some(make_buffer(buf));
-    if (ec.failed())
+    if (ec)
         co_return;
     // buf contains up to 13 bytes
 });
@@ -400,7 +400,7 @@ write_stream ws(f);
 
 auto r = f.armed([&](fuse&) -> task<> {
     auto [ec, n] = co_await ws.write_some(make_buffer("Hello"));
-    if (ec.failed())
+    if (ec)
         co_return;
 });
 
@@ -467,7 +467,7 @@ rs.provide("Hello, World!");
 auto r = f.armed([&](fuse&) -> task<> {
     char buf[32];
     auto [ec, n] = co_await rs.read(make_buffer(buf));
-    if (ec.failed())
+    if (ec)
         co_return;
     // buf filled completely before returning
 });
@@ -510,10 +510,10 @@ write_sink ws(f);
 
 auto r = f.armed([&](fuse&) -> task<> {
     auto [ec, n] = co_await ws.write(make_buffer("Hello"));
-    if (ec.failed())
+    if (ec)
         co_return;
     auto [ec2] = co_await ws.write_eof();
-    if (ec2.failed())
+    if (ec2)
         co_return;
 });
 
@@ -585,7 +585,7 @@ void test_parser()
         auto r = f.armed([&](fuse&) -> task<> {
             http_request req;
             auto [ec] = co_await parse_request(rs, req);
-            if (ec.failed())
+            if (ec)
                 co_return;
             BOOST_TEST(req.method == "GET");
             BOOST_TEST(req.target == "/");
diff --git a/doc/unlisted/library-when-all.adoc b/doc/unlisted/library-when-all.adoc
index 58aaa85c5..ee9cf8592 100644
--- a/doc/unlisted/library-when-all.adoc
+++ b/doc/unlisted/library-when-all.adoc
@@ -54,7 +54,7 @@ one finishes.
 
 == Return Value
 
-`when_all` returns a tuple of results, with void types filtered out:
+`when_all` returns a tuple of results. Void tasks contribute `std::monostate` to preserve the task-index-to-result-index mapping:
 
 [source,cpp]
 ----
@@ -65,20 +65,20 @@ auto [x, y] = co_await when_all(
 );
 // x is int, y is std::string
 
-// Mixed: void tasks don't contribute
-auto [value] = co_await when_all(
-    returns_int(),  // task<int>
-    returns_void(), // task<void> — no contribution
-    returns_void()  // task<void> — no contribution
+// Mixed: void tasks contribute monostate
+auto [a, b, c] = co_await when_all(
+    returns_int(),  // task<int>       — index 0
+    returns_void(), // task<void>      — index 1 → monostate
+    returns_void()  // task<void>      — index 2 → monostate
 );
-// value is int (only non-void result)
+// a is int, b and c are std::monostate
 
-// All void: returns void
-co_await when_all(
+// All void: returns tuple of monostate
+auto [m, n] = co_await when_all(
     task_void_1(),
     task_void_2()
 );
-// No tuple, no return value
+// m and n are std::monostate
 ----
 
 Results appear in the same order as input tasks.
@@ -244,7 +244,7 @@ task<void> fetch_all(http_client& client)
 | Launch tasks concurrently, wait for all
 
 | Return type
-| Tuple of non-void results in input order
+| Tuple of results in input order (`monostate` for void tasks)
 
 | Error handling
 | First exception propagated, siblings get stop
diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt
index a3c9dc6a9..0d48896dc 100644
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -1,5 +1,6 @@
 #
 # Copyright (c) 2026 Mungo Gill
+# Copyright (c) 2026 Steve Gerbino
 #
 # Distributed under the Boost Software License, Version 1.0. (See accompanying
 # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -7,21 +8,40 @@
 # Official repository: https://github.com/cppalliance/capy
 #
 
-add_subdirectory(hello-task)
-add_subdirectory(producer-consumer)
+add_subdirectory(async-mutex)
 add_subdirectory(buffer-composition)
+add_subdirectory(custom-dynamic-buffer)
+add_subdirectory(custom-executor)
+add_subdirectory(hello-task)
 add_subdirectory(mock-stream-testing)
-add_subdirectory(type-erased-echo)
-add_subdirectory(timeout-cancellation)
 add_subdirectory(parallel-fetch)
-add_subdirectory(custom-dynamic-buffer)
+add_subdirectory(parallel-tasks)
+add_subdirectory(producer-consumer)
+add_subdirectory(quitter-shutdown)
+add_subdirectory(strand-serialization)
 add_subdirectory(stream-pipeline)
+add_subdirectory(timeout-cancellation)
+add_subdirectory(type-erased-echo)
 add_subdirectory(when-any-cancellation)
+if(BOOST_CAPY_BUILD_P2300_EXAMPLES)
+    add_subdirectory(sender-bridge)
+    add_subdirectory(awaitable-sender)
+endif()
+
+if(BOOST_CAPY_BUILD_CUDA_EXAMPLES)
+    add_subdirectory(cuda/datamovement)
+endif()
+
+if(BOOST_CAPY_BUILD_NVEXEC_EXAMPLES)
+    add_subdirectory(cuda/pipeline)
+endif()
+
+add_subdirectory(fabrics)
+
+if(TARGET Boost::asio)
+    add_subdirectory(asio)
+endif()
 
-# Requires Corosio dependency
 if(TARGET Boost::corosio)
     add_subdirectory(echo-server-corosio)
 endif()
-
-add_subdirectory(allocation)
-add_subdirectory(asio)
diff --git a/example/allocation/CMakeLists.txt b/example/allocation/CMakeLists.txt
deleted file mode 100644
index c38fa9583..000000000
--- a/example/allocation/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-#
-# Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
-#
-# Distributed under the Boost Software License, Version 1.0. (See accompanying
-# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-#
-# Official repository: https://github.com/cppalliance/capy
-#
-
-if(BUILD_SHARED_LIBS)
-    include(FetchContent)
-    FetchContent_Declare(mimalloc
-        GIT_REPOSITORY https://github.com/microsoft/mimalloc
-        GIT_TAG v2.2.7
-        GIT_SHALLOW TRUE)
-    set(MI_BUILD_TESTS OFF CACHE BOOL "Disable mimalloc tests" FORCE)
-    FetchContent_MakeAvailable(mimalloc)
-endif()
-
-file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
-    CMakeLists.txt
-    Jamfile)
-
-source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
-
-add_executable(capy_example_allocation ${PFILES})
-
-set_property(TARGET capy_example_allocation
-    PROPERTY FOLDER "examples")
-
-target_link_libraries(capy_example_allocation Boost::capy)
-
-if(BUILD_SHARED_LIBS)
-    target_link_libraries(capy_example_allocation mimalloc-static)
-    target_compile_definitions(capy_example_allocation PRIVATE BOOST_CAPY_HAS_MIMALLOC=1)
-endif()
diff --git a/example/allocation/Jamfile b/example/allocation/Jamfile
deleted file mode 100644
index 937f690b5..000000000
--- a/example/allocation/Jamfile
+++ /dev/null
@@ -1,23 +0,0 @@
-#
-# Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
-#
-# Distributed under the Boost Software License, Version 1.0. (See accompanying
-# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-#
-# Official repository: https://github.com/cppalliance/capy
-#
-
-# Requires mimalloc (https://github.com/microsoft/mimalloc)
-# installed where the compiler can find it.
-
-project
-    : requirements
-      <library>/boost/capy//boost_capy
-      <include>.
-    ;
-
-exe allocation :
-    [ glob *.cpp ]
-    :
-    <linkflags>-lmimalloc
-    ;
diff --git a/example/asio/api/capy_streams.cpp b/example/asio/api/capy_streams.cpp
index bcbeb2591..712cb7c95 100644
--- a/example/asio/api/capy_streams.cpp
+++ b/example/asio/api/capy_streams.cpp
@@ -18,7 +18,6 @@
 
 #include <boost/capy/buffers.hpp>
 #include <boost/capy/buffers/asio.hpp>
-#include <boost/capy/buffers/buffer_array.hpp>
 #include <boost/capy/concept/stream.hpp>
 #include <coroutine>
 #include <boost/capy/ex/io_env.hpp>
@@ -83,6 +82,7 @@ class asio_socket
         MB buffers_;
         capy::io_result<std::size_t> result_;
         std::shared_ptr<cancel_state> cancel_;
+        capy::continuation cont_;
 
     public:
         read_awaitable(
@@ -103,19 +103,20 @@ class asio_socket
             std::coroutine_handle<> h,
             capy::io_env const* env)
         {
+            cont_.h = h;
             cancel_ = std::make_shared<cancel_state>(env->stop_token);
 
             self_->socket_.async_read_some(
-                capy::to_asio(capy::mutable_buffer_array<8>(buffers_)),
+                capy::to_asio(buffers_),
                 net::bind_cancellation_slot(
                     cancel_->signal.slot(),
-                    [this, h, ex = env->executor](
+                    [this, ex = env->executor](
                         boost::system::error_code ec,
                         std::size_t n) mutable
                     {
                         result_.ec = ec;
-                        result_.t1 = n;
-                        ex.post(h);
+                        std::get<0>(result_.values) = n;
+                        ex.post(cont_);
                     }));
 
             return std::noop_coroutine();
@@ -145,6 +146,7 @@ class asio_socket
         CB buffers_;
         capy::io_result<std::size_t> result_;
         std::shared_ptr<cancel_state> cancel_;
+        capy::continuation cont_;
 
     public:
         write_awaitable(
@@ -165,19 +167,20 @@ class asio_socket
             std::coroutine_handle<> h,
             capy::io_env const* env)
         {
+            cont_.h = h;
             cancel_ = std::make_shared<cancel_state>(env->stop_token);
 
             self_->socket_.async_write_some(
-                capy::to_asio(capy::const_buffer_array<8>(buffers_)),
+                capy::to_asio(buffers_),
                 net::bind_cancellation_slot(
                     cancel_->signal.slot(),
-                    [this, h, ex = env->executor](
+                    [this, ex = env->executor](
                         boost::system::error_code ec,
                         std::size_t n) mutable
                     {
                         result_.ec = ec;
-                        result_.t1 = n;
-                        ex.post(h);
+                        std::get<0>(result_.values) = n;
+                        ex.post(cont_);
                     }));
 
             return std::noop_coroutine();
@@ -235,14 +238,16 @@ class asio_executor
     void on_work_started() const noexcept {}
     void on_work_finished() const noexcept {}
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
+        auto h = c.h;
         net::post(ex_, [h]{ h.resume(); });
         return std::noop_coroutine();
     }
 
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
+        auto h = c.h;
         net::post(ex_, [h]{ h.resume(); });
     }
 };
diff --git a/example/asio/api/uni_stream.hpp b/example/asio/api/uni_stream.hpp
index b1a2962c9..93964cde1 100644
--- a/example/asio/api/uni_stream.hpp
+++ b/example/asio/api/uni_stream.hpp
@@ -55,14 +55,16 @@ class asio_executor_wrapper
     void on_work_started() const noexcept {}
     void on_work_finished() const noexcept {}
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
+        auto h = c.h;
         net::post(ex_, [h]{ h.resume(); });
         return std::noop_coroutine();
     }
 
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
+        auto h = c.h;
         net::post(ex_, [h]{ h.resume(); });
     }
 };
diff --git a/example/asio/api/use_capy.hpp b/example/asio/api/use_capy.hpp
index 224933058..bc227a767 100644
--- a/example/asio/api/use_capy.hpp
+++ b/example/asio/api/use_capy.hpp
@@ -158,24 +158,24 @@ class capy_awaitable
     void store_result(boost::system::error_code ec, T1&& t1)
     {
         result_.ec = ec;
-        result_.t1 = std::forward<T1>(t1);
+        std::get<0>(result_.values) = std::forward<T1>(t1);
     }
 
     template<typename T1, typename T2>
     void store_result(boost::system::error_code ec, T1&& t1, T2&& t2)
     {
         result_.ec = ec;
-        result_.t1 = std::forward<T1>(t1);
-        result_.t2 = std::forward<T2>(t2);
+        std::get<0>(result_.values) = std::forward<T1>(t1);
+        std::get<1>(result_.values) = std::forward<T2>(t2);
     }
 
     template<typename T1, typename T2, typename T3>
     void store_result(boost::system::error_code ec, T1&& t1, T2&& t2, T3&& t3)
     {
         result_.ec = ec;
-        result_.t1 = std::forward<T1>(t1);
-        result_.t2 = std::forward<T2>(t2);
-        result_.t3 = std::forward<T3>(t3);
+        std::get<0>(result_.values) = std::forward<T1>(t1);
+        std::get<1>(result_.values) = std::forward<T2>(t2);
+        std::get<2>(result_.values) = std::forward<T3>(t3);
     }
 };
 
diff --git a/example/async-mutex/CMakeLists.txt b/example/async-mutex/CMakeLists.txt
new file mode 100644
index 000000000..3cda678b7
--- /dev/null
+++ b/example/async-mutex/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2026 Mungo Gill
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
+    CMakeLists.txt
+    Jamfile)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_async_mutex ${PFILES})
+
+set_property(TARGET capy_example_async_mutex
+    PROPERTY FOLDER "examples")
+
+target_link_libraries(capy_example_async_mutex
+    Boost::capy)
diff --git a/example/async-mutex/async_mutex.cpp b/example/async-mutex/async_mutex.cpp
new file mode 100644
index 000000000..c65c593c5
--- /dev/null
+++ b/example/async-mutex/async_mutex.cpp
@@ -0,0 +1,89 @@
+//
+// Copyright (c) 2026 Mungo Gill
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Async Mutex Example
+//
+// Demonstrates async_mutex for fair FIFO coroutine locking.
+// Multiple worker coroutines compete for a shared resource;
+// the mutex ensures mutual exclusion and FIFO acquisition order.
+//
+
+#include <boost/capy.hpp>
+#include <iostream>
+#include <latch>
+#include <vector>
+
+namespace capy = boost::capy;
+
+int main()
+{
+    capy::thread_pool pool;
+    capy::strand s{pool.get_executor()};
+    std::latch done(1);
+
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr ep) {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "Error: " << e.what() << "\n";
+        }
+        catch (...) {
+            std::cerr << "Error: unknown exception\n";
+        }
+        done.count_down();
+    };
+
+    capy::async_mutex mtx;
+    int acquisition_order = 0;
+    std::vector<int> order_log;
+
+    auto worker = [&](int id) -> capy::io_task<> {
+        std::cout << "Worker " << id << " waiting for lock\n";
+        auto [ec, guard] = co_await mtx.scoped_lock();
+        if (ec)
+        {
+            std::cout << "Worker " << id
+                      << " canceled: " << ec.message() << "\n";
+            co_return capy::io_result<>{ec};
+        }
+
+        int seq = acquisition_order++;
+        order_log.push_back(id);
+        std::cout << "Worker " << id
+                  << " acquired lock (sequence " << seq << ")\n";
+
+        std::cout << "Worker " << id << " releasing lock\n";
+        co_return capy::io_result<>{};
+    };
+
+    auto run_all = [&]() -> capy::task<> {
+        auto r = co_await capy::when_all(
+            worker(0), worker(1), worker(2),
+            worker(3), worker(4), worker(5));
+        if(r.ec)
+            std::cerr << "when_all error: "
+                      << r.ec.message() << "\n";
+    };
+
+    // Run on a strand so async_mutex operations are single-threaded
+    capy::run_async(s, on_complete, on_error)(run_all());
+    done.wait();
+
+    std::cout << "\nAcquisition order: ";
+    for (std::size_t i = 0; i < order_log.size(); ++i)
+    {
+        if (i > 0)
+            std::cout << " -> ";
+        std::cout << "W" << order_log[i];
+    }
+    std::cout << "\n";
+
+    return 0;
+}
diff --git a/example/awaitable-sender/CMakeLists.txt b/example/awaitable-sender/CMakeLists.txt
new file mode 100644
index 000000000..d4561416f
--- /dev/null
+++ b/example/awaitable-sender/CMakeLists.txt
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+include(FetchContent)
+
+FetchContent_Declare(
+    execution
+    GIT_REPOSITORY https://github.com/bemanproject/execution.git
+    GIT_TAG main
+    SYSTEM
+)
+FetchContent_MakeAvailable(execution)
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
+    CMakeLists.txt
+    Jamfile)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_awaitable_sender ${PFILES})
+
+set_property(TARGET capy_example_awaitable_sender
+    PROPERTY FOLDER "examples")
+
+target_compile_features(capy_example_awaitable_sender
+    PRIVATE cxx_std_23)
+
+target_link_libraries(capy_example_awaitable_sender
+    Boost::capy
+    beman::execution_headers)
diff --git a/example/awaitable-sender/awaitable_sender.cpp b/example/awaitable-sender/awaitable_sender.cpp
new file mode 100644
index 000000000..fdeaf685e
--- /dev/null
+++ b/example/awaitable-sender/awaitable_sender.cpp
@@ -0,0 +1,162 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include "awaitable_sender.hpp"
+
+#include <boost/capy.hpp>
+
+#include <beman/execution/execution.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <latch>
+#include <stop_token>
+#include <system_error>
+#include <thread>
+
+namespace capy = boost::capy;
+namespace ex = beman::execution;
+
+// A receiver whose environment carries a Capy executor.
+// Completion signals a latch so main() can wait.
+struct demo_receiver
+{
+    using receiver_concept = ex::receiver_t;
+
+    capy::io_sender_env env_;
+    std::latch* done_;
+
+    auto get_env() const noexcept -> capy::io_sender_env
+    {
+        return env_;
+    }
+
+    void set_value() && noexcept
+    {
+        std::cout
+            << "  set_value on thread "
+            << std::this_thread::get_id() << "\n";
+        done_->count_down();
+    }
+
+    void set_error(std::error_code ec) && noexcept
+    {
+        std::cerr << "  error: " << ec.message() << "\n";
+        done_->count_down();
+    }
+
+    void set_error(std::exception_ptr ep) && noexcept
+    {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "  error: " << e.what() << "\n";
+        }
+        done_->count_down();
+    }
+
+    void set_stopped() && noexcept
+    {
+        std::cout << "  stopped\n";
+        done_->count_down();
+    }
+};
+
+int main()
+{
+    using namespace std::chrono_literals;
+
+    std::cout
+        << "main thread: "
+        << std::this_thread::get_id() << "\n";
+
+    // Capy execution context (provides timer service, etc.)
+    capy::thread_pool pool;
+
+    std::latch done(1);
+
+    // Build a sender from a Capy IoAwaitable
+    auto sndr = capy::as_sender(capy::delay(500ms));
+
+    // Connect with a receiver whose environment carries
+    // the Capy thread_pool executor
+    auto op = ex::connect(
+        std::move(sndr),
+        demo_receiver{
+            {pool.get_executor(), std::stop_token{}},
+            &done});
+
+    std::cout << "  starting delay...\n";
+    ex::start(op);
+
+    done.wait();
+    std::cout << "  delay completed\n";
+
+    // Test cancellation via stop token
+    std::cout << "\n--- cancellation test ---\n";
+    std::stop_source ss;
+    std::latch done2(1);
+
+    auto sndr2 = capy::as_sender(capy::delay(5s));
+    auto op2 = ex::connect(
+        std::move(sndr2),
+        demo_receiver{
+            {pool.get_executor(), ss.get_token()},
+            &done2});
+
+    std::cout << "  starting 5s delay...\n";
+    ex::start(op2);
+
+    std::this_thread::sleep_for(100ms);
+    std::cout << "  requesting stop...\n";
+    ss.request_stop();
+
+    done2.wait();
+    std::cout << "  cancellation test done\n";
+
+    // Test split_ec with success (error_code == 0)
+    std::cout << "\n--- split_ec success test ---\n";
+    std::latch done3(1);
+
+    auto sndr3 = capy::split_ec(
+        capy::as_sender(capy::delay(100ms)));
+    auto op3 = ex::connect(
+        std::move(sndr3),
+        demo_receiver{
+            {pool.get_executor(), std::stop_token{}},
+            &done3});
+
+    ex::start(op3);
+    done3.wait();
+    std::cout << "  split_ec success test done\n";
+
+    // Test split_ec with error (error_code != 0)
+    std::cout << "\n--- split_ec error test ---\n";
+    std::latch done4(1);
+
+    auto make_ec_sender = [&pool]() {
+        auto task = [](capy::executor_ref)
+            -> capy::task<std::error_code>
+        {
+            co_return std::make_error_code(
+                std::errc::connection_reset);
+        }(pool.get_executor());
+        return capy::as_sender(std::move(task));
+    };
+
+    auto sndr4 = capy::split_ec(make_ec_sender());
+    auto op4 = ex::connect(
+        std::move(sndr4),
+        demo_receiver{
+            {pool.get_executor(), std::stop_token{}},
+            &done4});
+
+    ex::start(op4);
+    done4.wait();
+    std::cout << "  split_ec error test done\n";
+}
diff --git a/example/awaitable-sender/awaitable_sender.hpp b/example/awaitable-sender/awaitable_sender.hpp
new file mode 100644
index 000000000..0e3fb9852
--- /dev/null
+++ b/example/awaitable-sender/awaitable_sender.hpp
@@ -0,0 +1,499 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EXAMPLE_AWAITABLE_SENDER_HPP
+#define BOOST_CAPY_EXAMPLE_AWAITABLE_SENDER_HPP
+
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <beman/execution/execution.hpp>
+
+#include <concepts>
+#include <coroutine>
+#include <exception>
+#include <stop_token>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace boost::capy {
+
+// -------------------------------------------------------
+// CPO: query a receiver environment for a Capy executor
+// -------------------------------------------------------
+
+struct get_io_executor_t
+{
+    template<class Env>
+    auto operator()(Env const& env) const noexcept
+        -> decltype(env.query(std::declval<get_io_executor_t const&>()))
+    {
+        return env.query(*this);
+    }
+};
+
+inline constexpr get_io_executor_t get_io_executor{};
+
+// -------------------------------------------------------
+// Environment that carries a Capy executor + stop token
+// -------------------------------------------------------
+
+struct io_sender_env
+{
+    executor_ref io_executor;
+    std::stop_token stop_token;
+
+    auto query(
+        get_io_executor_t const&) const noexcept
+        -> executor_ref
+    {
+        return io_executor;
+    }
+
+    auto query(
+        beman::execution::get_stop_token_t const&) const noexcept
+        -> std::stop_token
+    {
+        return stop_token;
+    }
+};
+
+namespace detail {
+
+template<class T, class = void>
+struct has_tuple_protocol : std::false_type {};
+
+template<class T>
+struct has_tuple_protocol<T,
+    std::void_t<
+        typename std::tuple_size<T>::type,
+        typename std::tuple_element<0, T>::type>>
+    : std::true_type {};
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_ec_outcome : std::is_same<T, std::error_code> {};
+
+template<class T>
+struct is_ec_outcome<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> == 1 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_ec_outcome_v =
+    std::is_same_v<T, std::error_code> ||
+    is_ec_outcome<T>::value;
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_compound_ec_result : std::false_type {};
+
+template<class T>
+struct is_compound_ec_result<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> >= 2 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_compound_ec_result_v =
+    is_compound_ec_result<T>::value;
+
+// -------------------------------------------------------
+// frame_cb: synthetic coroutine frame for callback handles
+//
+// The first two members match the coroutine frame layout
+// used by MSVC, GCC, and Clang. from_address produces a
+// coroutine_handle whose .resume() calls our function
+// pointer and whose .destroy() is a no-op.
+// -------------------------------------------------------
+
+struct frame_cb
+{
+    void (*resume)(frame_cb*);
+    void (*destroy)(frame_cb*);
+    void* data;
+};
+
+} // namespace detail
+
+// -------------------------------------------------------
+// Sender that wraps an IoAwaitable
+// -------------------------------------------------------
+
+template<class IoAw>
+struct awaitable_sender
+{
+    using sender_concept = beman::execution::sender_t;
+
+    using result_type = decltype(
+        std::declval<std::decay_t<IoAw>&>().await_resume());
+
+    static auto make_sigs()
+    {
+        if constexpr (std::is_void_v<result_type>)
+            return beman::execution::completion_signatures<
+                beman::execution::set_value_t(),
+                beman::execution::set_error_t(std::exception_ptr),
+                beman::execution::set_stopped_t()>{};
+        else if constexpr (
+            detail::is_ec_outcome_v<result_type>)
+            return beman::execution::completion_signatures<
+                beman::execution::set_value_t(),
+                beman::execution::set_error_t(std::error_code),
+                beman::execution::set_error_t(std::exception_ptr),
+                beman::execution::set_stopped_t()>{};
+        else
+            return beman::execution::completion_signatures<
+                beman::execution::set_value_t(result_type),
+                beman::execution::set_error_t(std::exception_ptr),
+                beman::execution::set_stopped_t()>{};
+    }
+
+    using completion_signatures = decltype(make_sigs());
+
+    IoAw aw_;
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            beman::execution::operation_state_t;
+
+        IoAw aw_;
+        Receiver rcvr_;
+        io_env env_;
+        detail::frame_cb cb_;
+
+        op_state(IoAw aw, Receiver rcvr)
+            : aw_(std::move(aw))
+            , rcvr_(std::move(rcvr))
+            , cb_{}
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        static void
+        on_resume(detail::frame_cb* p) noexcept
+        {
+            auto* self = static_cast<op_state*>(p->data);
+            self->complete();
+        }
+
+        static void
+        on_destroy(detail::frame_cb*) noexcept
+        {
+        }
+
+        void complete() noexcept
+        {
+            try
+            {
+                if constexpr (std::is_void_v<result_type>)
+                {
+                    aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        beman::execution::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        beman::execution::set_value(
+                            std::move(rcvr_));
+                }
+                else if constexpr (
+                    detail::is_ec_outcome_v<result_type>)
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                    {
+                        beman::execution::set_stopped(
+                            std::move(rcvr_));
+                    }
+                    else
+                    {
+                        std::error_code ec;
+                        if constexpr (std::is_same_v<
+                            result_type, std::error_code>)
+                            ec = result;
+                        else
+                            ec = get<0>(result);
+                        if(!ec)
+                            beman::execution::set_value(
+                                std::move(rcvr_));
+                        else
+                            beman::execution::set_error(
+                                std::move(rcvr_), ec);
+                    }
+                }
+                else
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        beman::execution::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        beman::execution::set_value(
+                            std::move(rcvr_),
+                            std::move(result));
+                }
+            }
+            catch(...)
+            {
+                beman::execution::set_error(
+                    std::move(rcvr_),
+                    std::current_exception());
+            }
+        }
+
+        void start() noexcept
+        {
+            auto renv = beman::execution::get_env(rcvr_);
+            auto ex = get_io_executor(renv);
+
+            std::stop_token st;
+            if constexpr (requires {
+                { renv.query(beman::execution::get_stop_token_t{}) }
+                    -> std::convertible_to<std::stop_token>; })
+            {
+                st = renv.query(
+                    beman::execution::get_stop_token_t{});
+            }
+
+            env_ = io_env{ex, st, nullptr};
+
+            if(aw_.await_ready())
+            {
+                complete();
+                return;
+            }
+
+            cb_.resume = &on_resume;
+            cb_.destroy = &on_destroy;
+            cb_.data = this;
+
+            auto h = std::coroutine_handle<>::from_address(
+                static_cast<void*>(&cb_));
+
+            detail::call_await_suspend(&aw_, h, &env_);
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(aw_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(aw_, std::move(rcvr));
+    }
+};
+
+/** Create a beman::execution sender from an IoAwaitable.
+
+    The bridge routes the awaitable's result through sender
+    channels based on its type:
+
+    - `void` - calls `set_value()`.
+    - `error_code` (or a single-element tuple-like whose
+      element 0 is `error_code`) - calls `set_value()`
+      when the code is zero, `set_error(ec)` otherwise.
+    - Any other single value `T` - calls `set_value(T)`.
+    - Compound results whose element 0 is `error_code`
+      with additional elements are rejected at compile
+      time. Wrap the operation in a `task<error_code>`
+      that inspects the compound result and returns the
+      error code.
+
+    @par Example
+    @code
+    auto sndr = as_sender(capy::delay(
+        std::chrono::milliseconds(100)));
+    @endcode
+
+    @param aw The IoAwaitable to wrap.
+    @return A sender whose completion channels reflect
+        the awaitable's result type.
+*/
+template<class IoAw>
+auto as_sender(IoAw&& aw)
+{
+    using R = decltype(
+        std::declval<std::decay_t<IoAw>&>().await_resume());
+    static_assert(
+        !detail::is_compound_ec_result_v<std::decay_t<R>>,
+        "as_sender does not accept awaitables whose result "
+        "destructures into (error_code, ...). Wrap the "
+        "operation in a task<error_code> that inspects the "
+        "compound result and returns the error code.");
+    return awaitable_sender<std::decay_t<IoAw>>{
+        std::forward<IoAw>(aw)};
+}
+
+// -------------------------------------------------------
+// split_ec: sender adapter that routes error_code to
+// set_value() or set_error(ec) at runtime.
+// -------------------------------------------------------
+
+namespace detail {
+
+template<class Sender>
+struct split_ec_sender
+{
+    using sender_concept = beman::execution::sender_t;
+
+    using completion_signatures =
+        beman::execution::completion_signatures<
+            beman::execution::set_value_t(),
+            beman::execution::set_error_t(std::error_code),
+            beman::execution::set_error_t(std::exception_ptr),
+            beman::execution::set_stopped_t()>;
+
+    Sender sndr_;
+
+    template<class Receiver>
+    struct ec_receiver
+    {
+        using receiver_concept = beman::execution::receiver_t;
+
+        Receiver rcvr_;
+
+        auto get_env() const noexcept
+        {
+            return beman::execution::get_env(rcvr_);
+        }
+
+        void set_value(std::error_code ec) && noexcept
+        {
+            if (!ec)
+                beman::execution::set_value(
+                    std::move(rcvr_));
+            else
+                beman::execution::set_error(
+                    std::move(rcvr_), ec);
+        }
+
+        void set_value() && noexcept
+        {
+            beman::execution::set_value(
+                std::move(rcvr_));
+        }
+
+        template<class E>
+        void set_error(E&& e) && noexcept
+        {
+            beman::execution::set_error(
+                std::move(rcvr_),
+                std::forward<E>(e));
+        }
+
+        void set_stopped() && noexcept
+        {
+            beman::execution::set_stopped(
+                std::move(rcvr_));
+        }
+    };
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            beman::execution::operation_state_t;
+
+        using inner_op_t = decltype(
+            beman::execution::connect(
+                std::declval<Sender>(),
+                std::declval<ec_receiver<Receiver>>()));
+
+        inner_op_t op_;
+
+        op_state(Sender sndr, Receiver rcvr)
+            : op_(beman::execution::connect(
+                std::move(sndr),
+                ec_receiver<Receiver>{std::move(rcvr)}))
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        void start() noexcept
+        {
+            beman::execution::start(op_);
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(sndr_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            sndr_, std::move(rcvr));
+    }
+};
+
+} // namespace detail
+
+/** Split an `error_code` value channel into success and error channels.
+
+    Takes a sender that completes with `set_value(error_code)` and
+    routes it at runtime: `set_value()` when the code is zero,
+    `set_error(ec)` otherwise. No exceptions.
+
+    @par Example
+    @code
+    do_read(sock, buf)
+        | split_ec()
+        | ex::upon_error(
+            [](std::error_code ec) {
+                // reachable, no exceptions
+            });
+    @endcode
+
+    @param sndr The predecessor sender.
+    @return A sender completing with `set_value()`,
+        `set_error(error_code)`, or `set_stopped()`.
+*/
+template<class Sender>
+auto split_ec(Sender&& sndr)
+{
+    return detail::split_ec_sender<
+        std::decay_t<Sender>>{
+            std::forward<Sender>(sndr)};
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/example/buffer-composition/buffer_composition.cpp b/example/buffer-composition/buffer_composition.cpp
index 8f39e6c92..554a8ae03 100644
--- a/example/buffer-composition/buffer_composition.cpp
+++ b/example/buffer-composition/buffer_composition.cpp
@@ -13,7 +13,7 @@
 #include <array>
 #include <vector>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 void demonstrate_single_buffers()
 {
@@ -25,32 +25,32 @@ void demonstrate_single_buffers()
     std::vector<char> vec = {'V', 'e', 'c', 't', 'o', 'r'};
     
     // make_buffer creates buffer views (no copies)
-    auto str_buf = make_buffer(str);  // mutable_buffer
-    auto arr_buf = make_buffer(arr, sizeof(arr) - 1);  // mutable_buffer - Exclude null terminator
-    auto vec_buf = make_buffer(vec);  // mutable_buffer
+    auto str_buf = capy::make_buffer(str);  // mutable_buffer
+    auto arr_buf = capy::make_buffer(arr, sizeof(arr) - 1);  // mutable_buffer - Exclude null terminator
+    auto vec_buf = capy::make_buffer(vec);  // mutable_buffer
     
     std::cout << "String buffer: " << str_buf.size() << " bytes\n";
     std::cout << "Array buffer:  " << arr_buf.size() << " bytes\n";
     std::cout << "Vector buffer: " << vec_buf.size() << " bytes\n";
 }
 
-void demonstrate_buffer_pair()
+void demonstrate_two_buffer_scatter()
 {
-    std::cout << "\n=== Buffer Pair (Scatter/Gather) ===\n\n";
+    std::cout << "\n=== Two-Buffer Scatter/Gather ===\n\n";
     
-    // const_buffer_pair is std::array<const_buffer, 2>
+    // Two-element scatter/gather sequence (just a std::array)
     std::string header = "Content-Type: text/plain\r\n\r\n";
     std::string body = "Hello, World!";
-    
-    const_buffer_pair message = {{
-        make_buffer(header),
-        make_buffer(body)
+
+    std::array<capy::const_buffer, 2> message = {{
+        capy::make_buffer(header),
+        capy::make_buffer(body)
     }};
     
     // Calculate total size
-    std::size_t total = buffer_size(message);
+    std::size_t total = capy::buffer_size(message);
     std::cout << "Total message size: " << total << " bytes\n";
-    std::cout << "Buffer count: " << buffer_length(message) << "\n";
+    std::cout << "Buffer count: " << capy::buffer_length(message) << "\n";
     
     // Iterate through buffers
     std::cout << "\nBuffer contents:\n";
@@ -73,17 +73,17 @@ void demonstrate_buffer_array()
     std::string empty_line = "\r\n";
     std::string body = R"({"status":"ok"})";
     
-    std::array<const_buffer, 5> http_response = {{
-        make_buffer(status),
-        make_buffer(content_type),
-        make_buffer(server),
-        make_buffer(empty_line),
-        make_buffer(body)
+    std::array<capy::const_buffer, 5> http_response = {{
+        capy::make_buffer(status),
+        capy::make_buffer(content_type),
+        capy::make_buffer(server),
+        capy::make_buffer(empty_line),
+        capy::make_buffer(body)
     }};
     
-    std::size_t total = buffer_size(http_response);
+    std::size_t total = capy::buffer_size(http_response);
     std::cout << "HTTP response size: " << total << " bytes\n";
-    std::cout << "Buffer count: " << buffer_length(http_response) << "\n";
+    std::cout << "Buffer count: " << capy::buffer_length(http_response) << "\n";
     
     // In real code with streams:
     // co_await write(stream, http_response);
@@ -98,13 +98,13 @@ void demonstrate_mutable_buffers()
     char buf1[64];
     char buf2[64];
     
-    mutable_buffer_pair recv_buffers = {{
-        mutable_buffer(buf1, sizeof(buf1)),
-        mutable_buffer(buf2, sizeof(buf2))
+    std::array<capy::mutable_buffer, 2> recv_buffers = {{
+        capy::mutable_buffer(buf1, sizeof(buf1)),
+        capy::mutable_buffer(buf2, sizeof(buf2))
     }};
     
-    std::cout << "Prepared " << buffer_length(recv_buffers) 
-              << " buffers with " << buffer_size(recv_buffers) 
+    std::cout << "Prepared " << capy::buffer_length(recv_buffers) 
+              << " buffers with " << capy::buffer_size(recv_buffers) 
               << " bytes total capacity\n";
     
     // In real code:
@@ -114,7 +114,7 @@ void demonstrate_mutable_buffers()
 int main()
 {
     demonstrate_single_buffers();
-    demonstrate_buffer_pair();
+    demonstrate_two_buffer_scatter();
     demonstrate_buffer_array();
     demonstrate_mutable_buffers();
     
diff --git a/example/cuda/datamovement/CMakeLists.txt b/example/cuda/datamovement/CMakeLists.txt
new file mode 100644
index 000000000..ec6262537
--- /dev/null
+++ b/example/cuda/datamovement/CMakeLists.txt
@@ -0,0 +1,52 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# CUDA was enabled at the top level when the option was flipped on.
+if(NOT CMAKE_CUDA_COMPILER)
+    message(FATAL_ERROR
+        "example/cuda/datamovement requires CUDA; "
+        "did you set BOOST_CAPY_BUILD_CUDA_EXAMPLES?")
+endif()
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cu *.cuh *.hpp
+    CMakeLists.txt
+    README.md)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_cuda_datamovement ${PFILES})
+
+set_target_properties(capy_example_cuda_datamovement PROPERTIES
+    FOLDER "examples"
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF)
+
+target_compile_features(capy_example_cuda_datamovement PRIVATE cxx_std_20)
+
+target_link_libraries(capy_example_cuda_datamovement PRIVATE
+    Boost::capy
+    CUDA::cudart)
+
+# The NCCL interop snippet compiles only when NCCL is available.
+# Without it, the rest of the example still builds.
+find_path(CAPY_NCCL_INCLUDE_DIR nccl.h)
+find_library(CAPY_NCCL_LIBRARY nccl)
+if(CAPY_NCCL_INCLUDE_DIR AND CAPY_NCCL_LIBRARY)
+    target_include_directories(capy_example_cuda_datamovement PRIVATE
+        ${CAPY_NCCL_INCLUDE_DIR})
+    target_link_libraries(capy_example_cuda_datamovement PRIVATE
+        ${CAPY_NCCL_LIBRARY})
+    target_compile_definitions(capy_example_cuda_datamovement PRIVATE
+        CAPY_EXAMPLE_HAS_NCCL=1)
+    message(STATUS "cuda/datamovement: NCCL found; building NCCL interop")
+else()
+    message(STATUS "cuda/datamovement: NCCL not found; skipping NCCL interop")
+endif()
diff --git a/example/cuda/datamovement/README.md b/example/cuda/datamovement/README.md
new file mode 100644
index 000000000..e645722cd
--- /dev/null
+++ b/example/cuda/datamovement/README.md
@@ -0,0 +1,62 @@
+# CUDA data-movement example (P4251R0)
+
+Validation that the CUDA data-movement listings from
+P4251R0 "IoAwaitables for GPU Data Movement" are type-correct against the
+real `boost::capy` API and CUDA. The paper flags this code as AI-generated
+and unverified; this target proves it compiles. Nothing here is executed
+at runtime.
+
+What is validated:
+
+- `cuda_stream_awaiter`: the io_env-less baseline. Asserted to be a
+  standard awaitable but **not** an `IoAwaitable`.
+- `cuda_stream`: `memcpy_h2d` / `memcpy_d2h` / `synchronize` return
+  `IoAwaitable`s.
+- NCCL interop: `ncclAllReduce` on `cuda_stream::native_handle()`
+  followed by `co_await synchronize()`. Built only when NCCL is found at
+  configure time.
+- `cuda_device_stream`: satisfies `WriteStream`, type-erases behind
+  `any_write_stream`, and the `ingest()` protocol handler compiles once
+  against both a GPU stream and an in-memory transport.
+- CUDA Graphs (`cuda_graphs.cu`): a captured graph is replayed inside
+  a coroutine that drives `cuda_stream` memcpy / synchronize.
+
+The non-GPU listings (the byte-oriented compound result and the
+RDMA/libfabric/UCX signatures) do not need CUDA and live in the sibling
+`example/fabrics` example. The sender bridge is in `example/cuda/pipeline`.
+
+## Prerequisites
+
+- NVIDIA GPU and driver visible to `nvidia-smi`.
+- CUDA toolkit (13.x works). On Arch: `pacman -S cuda`.
+- clang as host and CUDA compiler (verified with clang 22).
+- `CMAKE_CXX_STANDARD=20`.
+
+## Building
+
+```
+CXX=clang++ cmake -S . -B build-cuda -G Ninja \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_CXX_STANDARD=20 \
+    -DBOOST_CAPY_BUILD_CUDA_EXAMPLES=ON \
+    -DCMAKE_CUDA_COMPILER=clang++ \
+    -DCMAKE_CUDA_HOST_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=89 \
+    -DCUDAToolkit_ROOT=/opt/cuda
+cmake --build build-cuda --config Release --target capy_example_cuda_datamovement
+```
+
+Replace `89` with your GPU's compute capability
+(`nvidia-smi --query-gpu=compute_cap --format=csv,noheader`).
+
+A clean build is the pass condition; the binary need not be run.
+
+## Scope
+
+No runtime execution and no multi-device topologies. A clean
+build with every `static_assert` holding is the whole deliverable. The
+NCCL snippet builds only when NCCL is found. NVSHMEM (a GPU member of the
+paper's HPC-fabric list) is not verified: `nvshmem_int_put` is device-side
+and its headers do not compile under clang-cuda (capy requires clang-cuda,
+since nvcc lacks C++20 coroutines). The non-GPU fabric signatures live in
+`example/fabrics`, and the sender bridge in `example/cuda/pipeline`.
diff --git a/example/cuda/datamovement/cuda_datamovement.cu b/example/cuda/datamovement/cuda_datamovement.cu
new file mode 100644
index 000000000..569242798
--- /dev/null
+++ b/example/cuda/datamovement/cuda_datamovement.cu
@@ -0,0 +1,92 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include "cuda_datamovement.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/test/write_stream.hpp>
+
+#include <cstddef>
+#include <span>
+#include <system_error>
+#include <type_traits>
+#include <utility>
+
+namespace capy = boost::capy;
+namespace ex   = capy::example;
+
+// Intentionally io_env-less: a standard awaitable, not an IoAwaitable.
+static_assert(! capy::IoAwaitable<ex::cuda_stream_awaiter>);
+
+// The data-movement awaitables depend on this helper, which the paper
+// references but never defines.
+static_assert(std::is_same_v<
+    decltype(ex::make_cuda_error(cudaSuccess)), std::error_code>);
+
+// The memcpy member functions return IoAwaitables.
+static_assert(capy::IoAwaitable<
+    decltype(std::declval<ex::cuda_stream&>().memcpy_h2d(
+        nullptr, nullptr, std::size_t{0}))>);
+static_assert(capy::IoAwaitable<
+    decltype(std::declval<ex::cuda_stream&>().memcpy_d2h(
+        nullptr, nullptr, std::size_t{0}))>);
+static_assert(capy::IoAwaitable<
+    decltype(std::declval<ex::cuda_stream&>().synchronize())>);
+
+// GPU device memory satisfies WriteStream and type-erases with zero
+// per-operation allocation.
+static_assert(capy::WriteStream<ex::cuda_device_stream>);
+
+// A protocol handler compiled once, linked against any transport.
+capy::task<>
+ingest(capy::any_write_stream& dest, std::span<std::byte const> data)
+{
+    auto [ec, n] = co_await dest.write_some(
+        capy::make_buffer(data.data(), data.size()));
+    if(ec)
+        co_return;
+    // ...protocol logic...
+}
+
+// Reference ingest against two transports to force the "one .o, many
+// transports" claim to compile. Never executed.
+[[maybe_unused]] void
+link_check()
+{
+    ex::cuda_device_stream gpu(nullptr, nullptr);
+    capy::any_write_stream gpu_dest(&gpu);     // GPU device memory
+
+    capy::test::write_stream mem;
+    capy::any_write_stream mem_dest(&mem);     // in-memory transport
+
+    std::byte payload[8]{};
+    (void) ingest(gpu_dest, payload);
+    (void) ingest(mem_dest, payload);
+}
+
+#if defined(CAPY_EXAMPLE_HAS_NCCL)
+#include <nccl.h>
+
+// NCCL interop: a collective enqueues onto the CUDA stream, then
+// synchronize() awaits its completion through the same IoAwaitable path.
+capy::task<>
+all_reduce(ex::cuda_stream& cs, ncclComm_t comm,
+    float const* sendbuf, float* recvbuf, std::size_t count)
+{
+    ncclAllReduce(sendbuf, recvbuf, count, ncclFloat, ncclSum,
+        comm, cs.native_handle());
+    co_await cs.synchronize();
+}
+#endif
+
+int main()
+{
+    return 0;
+}
diff --git a/example/cuda/datamovement/cuda_datamovement.hpp b/example/cuda/datamovement/cuda_datamovement.hpp
new file mode 100644
index 000000000..4855cd675
--- /dev/null
+++ b/example/cuda/datamovement/cuda_datamovement.hpp
@@ -0,0 +1,353 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EXAMPLE_CUDA_DATAMOVEMENT_HPP
+#define BOOST_CAPY_EXAMPLE_CUDA_DATAMOVEMENT_HPP
+
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/io_result.hpp>
+#include <boost/capy/buffers.hpp>
+#include <boost/capy/concept/const_buffer_sequence.hpp>
+
+#include <cuda_runtime.h>
+
+#include <coroutine>
+#include <cstddef>
+#include <string>
+#include <system_error>
+#include <utility>
+
+namespace boost {
+namespace capy {
+namespace example {
+
+/// Error category for `cudaError_t` values.
+class cuda_error_category
+    : public std::error_category
+{
+public:
+    char const* name() const noexcept override
+    {
+        return "cuda";
+    }
+
+    std::string message(int ev) const override
+    {
+        return ::cudaGetErrorString(static_cast<cudaError_t>(ev));
+    }
+};
+
+/// Return the singleton CUDA error category.
+inline std::error_category const& cuda_category() noexcept
+{
+    static cuda_error_category const cat;
+    return cat;
+}
+
+/// Convert a `cudaError_t` to a `std::error_code`.
+inline std::error_code make_cuda_error(cudaError_t e) noexcept
+{
+    return std::error_code(static_cast<int>(e), cuda_category());
+}
+
+/// A minimal hand-rolled CUDA-completion awaitable (no executor
+/// affinity, cancellation, or frame allocator). Resumes on the CUDA
+/// driver callback thread.
+struct cuda_stream_awaiter
+{
+    cudaStream_t stream;
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    void await_suspend(std::coroutine_handle<> h)
+    {
+        cudaLaunchHostFunc(stream,
+            [](void* data)
+            {
+                std::coroutine_handle<>::from_address(data).resume();
+            },
+            h.address());
+    }
+
+    void await_resume() noexcept
+    {
+    }
+};
+
+/// A CUDA stream whose data-movement operations are IoAwaitables.
+///
+/// `memcpy_h2d`/`memcpy_d2h` issue a `cudaMemcpyAsync` and resume the
+/// awaiting coroutine on `env->executor` when the stream's
+/// `cudaLaunchHostFunc` callback fires. One operation is in flight per
+/// stream at a time, so the resume context is a pre-allocated member
+/// rather than a per-operation allocation.
+class cuda_stream
+{
+    cudaStream_t stream_ = nullptr;
+    continuation cont_;
+    std::error_code error_;
+
+    struct resume_ctx
+    {
+        executor_ref ex;
+        continuation* cont = nullptr;
+    };
+
+    resume_ctx ctx_;
+
+    static void CUDART_CB
+    on_complete(void* arg)
+    {
+        auto* ctx = static_cast<resume_ctx*>(arg);
+        ctx->ex.post(*ctx->cont);
+    }
+
+    // The paper hardcodes HostToDevice and describes memcpy_d2h as "the
+    // same pattern"; a kind field unifies both without duplicating the
+    // awaitable.
+    struct copy_awaitable
+    {
+        cuda_stream* self;
+        void* dst;
+        void const* src;
+        std::size_t count;
+        cudaMemcpyKind kind;
+
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(std::coroutine_handle<> h, io_env const* env)
+        {
+            auto err = cudaMemcpyAsync(
+                dst, src, count, kind, self->stream_);
+            if(err != cudaSuccess)
+            {
+                self->error_ = make_cuda_error(err);
+                return h;
+            }
+            self->cont_.h = h;
+            self->ctx_ = resume_ctx{env->executor, &self->cont_};
+            err = cudaLaunchHostFunc(
+                self->stream_, &on_complete, &self->ctx_);
+            if(err != cudaSuccess)
+            {
+                self->error_ = make_cuda_error(err);
+                return h;
+            }
+            return std::noop_coroutine();
+        }
+
+        void await_resume()
+        {
+            if(self->error_)
+                throw std::system_error(self->error_);
+            self->error_ = {};
+        }
+    };
+
+    struct sync_awaitable
+    {
+        cuda_stream* self;
+
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(std::coroutine_handle<> h, io_env const* env)
+        {
+            self->cont_.h = h;
+            self->ctx_ = resume_ctx{env->executor, &self->cont_};
+            auto err = cudaLaunchHostFunc(
+                self->stream_, &on_complete, &self->ctx_);
+            if(err != cudaSuccess)
+            {
+                self->error_ = make_cuda_error(err);
+                return h;
+            }
+            return std::noop_coroutine();
+        }
+
+        void await_resume()
+        {
+            if(self->error_)
+                throw std::system_error(self->error_);
+            self->error_ = {};
+        }
+    };
+
+public:
+    cuda_stream()
+    {
+        auto err = cudaStreamCreate(&stream_);
+        if(err != cudaSuccess)
+            throw std::system_error(make_cuda_error(err));
+    }
+
+    ~cuda_stream()
+    {
+        if(stream_)
+            cudaStreamDestroy(stream_);
+    }
+
+    cuda_stream(cuda_stream&& other) noexcept
+        : stream_(std::exchange(other.stream_, nullptr))
+    {
+    }
+
+    cuda_stream& operator=(cuda_stream&& other) noexcept
+    {
+        if(this != &other)
+        {
+            if(stream_)
+                cudaStreamDestroy(stream_);
+            stream_ = std::exchange(other.stream_, nullptr);
+        }
+        return *this;
+    }
+
+    cuda_stream(cuda_stream const&) = delete;
+    cuda_stream& operator=(cuda_stream const&) = delete;
+
+    /// Return the underlying CUDA stream handle.
+    cudaStream_t native_handle() const noexcept
+    {
+        return stream_;
+    }
+
+    /// Asynchronously copy `count` bytes from host `src` to device `dst`.
+    auto memcpy_h2d(void* dst, void const* src, std::size_t count)
+    {
+        return copy_awaitable{
+            this, dst, src, count, cudaMemcpyHostToDevice};
+    }
+
+    /// Asynchronously copy `count` bytes from device `src` to host `dst`.
+    auto memcpy_d2h(void* dst, void const* src, std::size_t count)
+    {
+        return copy_awaitable{
+            this, dst, src, count, cudaMemcpyDeviceToHost};
+    }
+
+    /// Asynchronously wait for all pending stream operations to complete.
+    auto synchronize()
+    {
+        return sync_awaitable{this};
+    }
+};
+
+/// GPU device memory exposed as a WriteStream.
+///
+/// Reshapes the `cuda_stream` memcpy pattern to satisfy `WriteStream`, so device
+/// memory can hide behind `any_write_stream`. Because `cudaMemcpyAsync`
+/// transfers the whole buffer in one operation, `write_some` never
+/// performs a partial write. Errors are delivered via `io_result`
+/// rather than exceptions. Does not own `stream_`; the caller is
+/// responsible for the stream's lifetime.
+class cuda_device_stream
+{
+    cudaStream_t stream_;
+    std::byte* d_ptr_;
+    std::size_t offset_ = 0;
+    continuation cont_;
+    std::error_code error_;
+
+    struct resume_ctx
+    {
+        executor_ref ex;
+        continuation* cont = nullptr;
+    };
+
+    resume_ctx ctx_;
+
+    static void CUDART_CB
+    on_complete(void* arg)
+    {
+        auto* ctx = static_cast<resume_ctx*>(arg);
+        ctx->ex.post(*ctx->cont);
+    }
+
+public:
+    cuda_device_stream(cudaStream_t s, std::byte* device_ptr)
+        : stream_(s)
+        , d_ptr_(device_ptr)
+    {
+    }
+
+    template<ConstBufferSequence Buffers>
+    auto write_some(Buffers buffers)
+    {
+        struct awaitable
+        {
+            cuda_device_stream* self;
+            const_buffer buf;
+
+            bool await_ready() const noexcept
+            {
+                return false;
+            }
+
+            std::coroutine_handle<>
+            await_suspend(std::coroutine_handle<> h, io_env const* env)
+            {
+                auto n = buf.size();
+                auto err = cudaMemcpyAsync(
+                    self->d_ptr_ + self->offset_,
+                    buf.data(), n,
+                    cudaMemcpyHostToDevice,
+                    self->stream_);
+                if(err != cudaSuccess)
+                {
+                    self->error_ = make_cuda_error(err);
+                    return h;
+                }
+                self->cont_.h = h;
+                self->ctx_ = resume_ctx{env->executor, &self->cont_};
+                err = cudaLaunchHostFunc(
+                    self->stream_, &on_complete, &self->ctx_);
+                if(err != cudaSuccess)
+                {
+                    self->error_ = make_cuda_error(err);
+                    return h;
+                }
+                return std::noop_coroutine();
+            }
+
+            io_result<std::size_t>
+            await_resume()
+            {
+                if(self->error_)
+                {
+                    auto ec = self->error_;
+                    self->error_ = {};
+                    return {ec, 0};
+                }
+                auto n = buf.size();
+                self->offset_ += n;
+                return {{}, n};
+            }
+        };
+        return awaitable{this, *capy::begin(buffers)};
+    }
+};
+
+} // namespace example
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/example/cuda/datamovement/cuda_graphs.cu b/example/cuda/datamovement/cuda_graphs.cu
new file mode 100644
index 000000000..eb7e041ab
--- /dev/null
+++ b/example/cuda/datamovement/cuda_graphs.cu
@@ -0,0 +1,69 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include "cuda_datamovement.hpp"
+
+#include <boost/capy.hpp>
+
+#include <cuda_runtime.h>
+
+#include <cstddef>
+
+namespace capy = boost::capy;
+namespace ex   = capy::example;
+
+namespace {
+
+__global__ void
+kernel_A(float* y, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i < n)
+        y[i] += 1.0f;
+}
+
+__global__ void
+kernel_B(float* y, int n)
+{
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if(i < n)
+        y[i] *= 2.0f;
+}
+
+// A pre-captured CUDA graph is the inner optimized hot path; the
+// coroutine is the outer, data-dependent loop (copy in, launch the graph,
+// copy out). Graph replay and coroutine orchestration optimize different
+// layers and compose without either subsuming the other.
+[[maybe_unused]] capy::task<>
+graph_replay(ex::cuda_stream& cs, float* d_y, float* h_y, int n)
+{
+    cudaStream_t stream = cs.native_handle();
+
+    cudaGraph_t graph;
+    cudaGraphExec_t instance;
+    dim3 grid(1);
+    dim3 block(static_cast<unsigned>(n));
+
+    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
+    kernel_A<<<grid, block, 0, stream>>>(d_y, n);
+    kernel_B<<<grid, block, 0, stream>>>(d_y, n);
+    cudaStreamEndCapture(stream, &graph);
+
+    cudaGraphInstantiate(&instance, graph, 0);
+
+    co_await cs.memcpy_h2d(d_y, h_y, n * sizeof(float));
+    cudaGraphLaunch(instance, stream);
+    co_await cs.synchronize();
+    co_await cs.memcpy_d2h(h_y, d_y, n * sizeof(float));
+
+    cudaGraphExecDestroy(instance);
+    cudaGraphDestroy(graph);
+}
+
+} // namespace
diff --git a/example/cuda/pipeline/CMakeLists.txt b/example/cuda/pipeline/CMakeLists.txt
new file mode 100644
index 000000000..3913b4cf0
--- /dev/null
+++ b/example/cuda/pipeline/CMakeLists.txt
@@ -0,0 +1,58 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# CUDA was enabled at the top level when the option was flipped on.
+# Honor a clean error if the user wired around it.
+if(NOT CMAKE_CUDA_COMPILER)
+    message(FATAL_ERROR
+        "example/cuda-pipeline requires CUDA; "
+        "did you set BOOST_CAPY_BUILD_NVEXEC_EXAMPLES?")
+endif()
+
+# Fetch NVIDIA/stdexec independently of bench so the example builds
+# even with BOOST_CAPY_BUILD_BENCH=OFF. If bench has already declared
+# the same content with the same name, this call is a no-op.
+include(FetchContent)
+FetchContent_Declare(
+    stdexec
+    GIT_REPOSITORY https://github.com/NVIDIA/stdexec
+    GIT_TAG        307b83c5689ea7c2e5b31561cdc428697705333e
+    SYSTEM
+    FIND_PACKAGE_ARGS NAMES stdexec
+)
+FetchContent_MakeAvailable(stdexec)
+
+if(NOT TARGET STDEXEC::nvexec)
+    message(FATAL_ERROR
+        "STDEXEC::nvexec target not found after configuring stdexec. "
+        "Ensure CUDA is enabled and STDEXEC_ENABLE_CUDA=ON.")
+endif()
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cu *.cuh *.hpp
+    CMakeLists.txt
+    README.md)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_cuda_pipeline ${PFILES})
+
+set_target_properties(capy_example_cuda_pipeline PROPERTIES
+    FOLDER "examples"
+    CUDA_STANDARD 20
+    CUDA_STANDARD_REQUIRED ON
+    CUDA_SEPARABLE_COMPILATION OFF)
+
+target_compile_features(capy_example_cuda_pipeline PRIVATE cxx_std_23)
+
+target_link_libraries(capy_example_cuda_pipeline PRIVATE
+    Boost::capy
+    STDEXEC::stdexec
+    STDEXEC::nvexec
+    CUDA::cudart)
diff --git a/example/cuda/pipeline/Jamfile b/example/cuda/pipeline/Jamfile
new file mode 100644
index 000000000..bbc1545a4
--- /dev/null
+++ b/example/cuda/pipeline/Jamfile
@@ -0,0 +1,12 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# This example requires CUDA, nvc++, and NVIDIA/stdexec (nvexec).
+# It is built only via the CMake build (BOOST_CAPY_BUILD_NVEXEC_EXAMPLES=ON);
+# the b2 build does not currently provide CUDA support for capy.
diff --git a/example/cuda/pipeline/README.md b/example/cuda/pipeline/README.md
new file mode 100644
index 000000000..719b48ad5
--- /dev/null
+++ b/example/cuda/pipeline/README.md
@@ -0,0 +1,99 @@
+# CUDA pipeline example
+
+This example demonstrates that `boost::capy::await_sender` and
+`boost::capy::as_sender` compose with NVIDIA's `nvexec::stream_scheduler`,
+not just with CPU schedulers. Two runnable scenes, plus a third that is
+built but not run (P4251R0):
+
+1. **Scene 1 (Direction 1).** A `boost::capy::task` running on
+   `boost::capy::thread_pool` `co_await`s a sender whose terminal action is
+   a real `__global__` SAXPY kernel scheduled on `nvexec::stream_scheduler`.
+   When the CUDA stream signals completion, the coroutine resumes on the
+   capy executor with the kernel's result.
+
+2. **Scene 2 (Direction 2).** `boost::capy::test::stream::read_some` is
+   exposed as a stdexec sender via `boost::capy::as_sender`, composed with
+   `stdexec::upon_error`, and driven by `stdexec::sync_wait`. Two runs: a
+   happy-path read, and a peer-close that exercises the `upon_error` arm.
+
+   The example wraps `read_some` (a raw IoAwaitable) rather than
+   `boost::capy::read` (a `task<io_result<size_t>>`). The bridge's `start()`
+   does not perform symmetric transfer to a wrapped task's own coroutine
+   handle, so wrapping a task in `as_sender` hangs. Wrapping a raw
+   IoAwaitable works because its `await_suspend` is either ready-with-data
+   or returns `noop_coroutine()` after stashing the continuation for the
+   peer to resume.
+
+3. **Scene 3 (P4251R0), built but not run.** `handle_request` shows the
+   inference-handler shape: a type-erased `any_read_source` read, GPU
+   dispatch via `await_sender` over a real nvexec kernel, and a type-erased
+   `any_write_sink` write. It is compiled but not executed (`main` does not
+   call it). The paper's listing runs a host `run_model()` under a
+   device-side `then()`, which does not compile on nvexec (host call from
+   device); this mirrors Scene 1's pattern instead, dispatching a real
+   kernel and hopping `continues_on(cpu)` before the host-only bridge, and
+   takes a CPU scheduler the paper's signature omits.
+
+The bridge headers (`awaitable_sender.hpp`, `sender_awaitable.hpp`) are
+copied verbatim from `bench/stdexec/`; the bridge in the bench was already
+written against NVIDIA/stdexec.
+
+## Prerequisites
+
+- NVIDIA GPU and driver visible to `nvidia-smi`.
+- CUDA toolkit. On Arch: `pacman -S cuda`. CUDA 13.x works.
+- A C++23-capable compiler with both `<coroutine>` support and CUDA
+  device-side compilation. Verified locally with clang 22 as host *and*
+  CUDA compiler.
+- `CMAKE_CXX_STANDARD=23`.
+
+nvc++ from the NVHPC SDK is the nominally blessed compiler for nvexec,
+but nvc++ 26.3 does not enable C++20 coroutines (no `__cpp_impl_coroutine`,
+`co_return` parses as undefined). capy is built on coroutines, so nvc++
+cannot compile capy at present. Clang-cuda is the working alternative.
+
+## Building and running
+
+```
+CXX=clang++ cmake -S . -B build \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_CXX_STANDARD=23 \
+    -DCMAKE_CUDA_COMPILER=clang++ \
+    -DCMAKE_CUDA_HOST_COMPILER=clang++ \
+    -DCMAKE_CUDA_ARCHITECTURES=89 \
+    -DCUDAToolkit_ROOT=/opt/cuda \
+    -DBOOST_CAPY_BUILD_STDEXEC_EXAMPLES=ON \
+    -DBOOST_CAPY_BUILD_NVEXEC_EXAMPLES=ON
+cmake --build build --config Release --target capy_example_cuda_pipeline
+./build/example/cuda-pipeline/capy_example_cuda_pipeline
+```
+
+Replace `89` with your GPU's compute capability (`nvidia-smi
+--query-gpu=compute_cap --format=csv,noheader`).
+
+## Expected output
+
+The exact thread ids vary, but the structure is fixed:
+
+```
+main thread: <tid-main>
+--- scene 1: await_sender( gpu sender ) ---
+  scene1: pre-await on thread <tid-A>
+  scene1: post-await on thread <tid-B>
+  scene1: y[0] = 5
+--- scene 2a: as_sender( read_some ) happy ---
+  scene2 happy: read 13 bytes
+--- scene 2b: as_sender( read_some ) error ---
+  scene2 error: upon_error fired with "eof" (n=0)
+all scenes passed
+```
+
+Exit status is 0 on success and non-zero on any failed assertion or CUDA
+error.
+
+## Scope
+
+Correctness only. No performance measurement; no GPU-side cancellation;
+no multi-device topologies. See
+`docs/superpowers/specs/2026-05-27-stdexec-gpu-example-design.md` for the
+full scope statement.
diff --git a/example/cuda/pipeline/awaitable_sender.hpp b/example/cuda/pipeline/awaitable_sender.hpp
new file mode 100644
index 000000000..f13050bd5
--- /dev/null
+++ b/example/cuda/pipeline/awaitable_sender.hpp
@@ -0,0 +1,568 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_AWAITABLE_SENDER_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_AWAITABLE_SENDER_HPP
+
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <stdexec/execution.hpp>
+
+#include <concepts>
+#include <coroutine>
+#include <exception>
+#include <stop_token>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace boost::capy {
+
+// Query CPO for obtaining a Capy-compatible executor
+// from a P2300 environment. The returned object must
+// satisfy Capy's Executor concept. Environments that
+// host IoAwaitables via the as_sender bridge must
+// answer this query.
+struct get_io_executor_t
+{
+    static consteval auto query(
+        stdexec::forwarding_query_t) noexcept -> bool
+    {
+        return true;
+    }
+
+    template<class Env>
+        requires requires(Env const& env) {
+            env.query(
+                std::declval<get_io_executor_t const&>());
+        }
+    auto operator()(Env const& env) const noexcept
+    {
+        return env.query(*this);
+    }
+};
+
+inline constexpr get_io_executor_t get_io_executor{};
+
+namespace detail {
+
+template<class T, class = void>
+struct has_tuple_protocol : std::false_type {};
+
+template<class T>
+struct has_tuple_protocol<T,
+    std::void_t<
+        typename std::tuple_size<T>::type,
+        typename std::tuple_element<0, T>::type>>
+    : std::true_type {};
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_ec_outcome : std::is_same<T, std::error_code> {};
+
+template<class T>
+struct is_ec_outcome<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> == 1 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_ec_outcome_v =
+    std::is_same_v<T, std::error_code> ||
+    is_ec_outcome<T>::value;
+
+template<class T, bool = has_tuple_protocol<T>::value>
+struct is_compound_ec_result : std::false_type {};
+
+template<class T>
+struct is_compound_ec_result<T, true>
+    : std::bool_constant<
+        std::tuple_size_v<T> >= 2 &&
+        std::is_same_v<
+            std::tuple_element_t<0, T>,
+            std::error_code>>
+{};
+
+template<class T>
+constexpr bool is_compound_ec_result_v =
+    is_compound_ec_result<T>::value;
+
+struct frame_cb
+{
+    void (*resume)(frame_cb*);
+    void (*destroy)(frame_cb*);
+    void* data;
+};
+
+// Return the concrete executor by value, trying get_io_executor
+// on the env first, then falling back to the start scheduler.
+template<class Env>
+auto resolve_executor(Env const& env)
+{
+    if constexpr (requires { get_io_executor(env); })
+        return get_io_executor(env);
+    else
+        return stdexec::get_start_scheduler(env)
+            .query(get_io_executor_t{});
+}
+
+} // namespace detail
+
+/** Sender that wraps an IoAwaitable.
+
+    When connected or co_awaited, the bridge queries
+    the receiver's or promise's environment for a
+    Capy-compatible executor via get_io_executor.
+    The executor is stored by value in the operation
+    state and used to construct the io_env passed to
+    the IoAwaitable's await_suspend.
+
+    @tparam IoAw The IoAwaitable type.
+*/
+template<class IoAw>
+struct awaitable_sender
+{
+    using sender_concept = stdexec::sender_tag;
+
+    using result_type = decltype(
+        std::declval<std::decay_t<IoAw>&>().await_resume());
+
+    static auto make_sigs()
+    {
+        if constexpr (std::is_void_v<result_type>)
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+        else if constexpr (
+            detail::is_compound_ec_result_v<result_type>)
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(
+                    std::tuple_element_t<1, result_type>),
+                stdexec::set_error_t(std::error_code),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+        else if constexpr (
+            detail::is_ec_outcome_v<result_type>)
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(),
+                stdexec::set_error_t(std::error_code),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+        else
+            return stdexec::completion_signatures<
+                stdexec::set_value_t(result_type),
+                stdexec::set_error_t(std::exception_ptr),
+                stdexec::set_stopped_t()>{};
+    }
+
+    using completion_signatures = decltype(make_sigs());
+
+    IoAw aw_;
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            stdexec::operation_state_tag;
+
+        // Concrete executor type deduced from the receiver's
+        // environment. Stored by value to avoid the dangling
+        // pointer that executor_ref would produce when the
+        // source is a temporary (scheduler query or prop).
+        using executor_type = decltype(
+            detail::resolve_executor(
+                stdexec::get_env(
+                    std::declval<Receiver const&>())));
+
+        IoAw aw_;
+        Receiver rcvr_;
+        executor_type ex_;
+        io_env env_;
+        detail::frame_cb cb_;
+
+        op_state(IoAw aw, Receiver rcvr)
+            : aw_(std::move(aw))
+            , rcvr_(std::move(rcvr))
+            , ex_{}
+            , cb_{}
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        static void
+        on_resume(detail::frame_cb* p) noexcept
+        {
+            auto* self = static_cast<op_state*>(p->data);
+            self->complete();
+        }
+
+        static void
+        on_destroy(detail::frame_cb*) noexcept
+        {
+        }
+
+        void complete() noexcept
+        {
+            try
+            {
+                if constexpr (std::is_void_v<result_type>)
+                {
+                    aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        stdexec::set_value(
+                            std::move(rcvr_));
+                }
+                else if constexpr (
+                    detail::is_compound_ec_result_v<result_type>)
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                    {
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    }
+                    else
+                    {
+                        auto ec = get<0>(result);
+                        if(!ec)
+                            stdexec::set_value(
+                                std::move(rcvr_),
+                                get<1>(std::move(result)));
+                        else
+                            stdexec::set_error(
+                                std::move(rcvr_), ec);
+                    }
+                }
+                else if constexpr (
+                    detail::is_ec_outcome_v<result_type>)
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                    {
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    }
+                    else
+                    {
+                        std::error_code ec;
+                        if constexpr (std::is_same_v<
+                            result_type, std::error_code>)
+                            ec = result;
+                        else
+                            ec = get<0>(result);
+                        if(!ec)
+                            stdexec::set_value(
+                                std::move(rcvr_));
+                        else
+                            stdexec::set_error(
+                                std::move(rcvr_), ec);
+                    }
+                }
+                else
+                {
+                    auto result = aw_.await_resume();
+                    if(env_.stop_token.stop_requested())
+                        stdexec::set_stopped(
+                            std::move(rcvr_));
+                    else
+                        stdexec::set_value(
+                            std::move(rcvr_),
+                            std::move(result));
+                }
+            }
+            catch(...)
+            {
+                stdexec::set_error(
+                    std::move(rcvr_),
+                    std::current_exception());
+            }
+        }
+
+        void start() noexcept
+        {
+            auto renv = stdexec::get_env(rcvr_);
+            ex_ = detail::resolve_executor(renv);
+
+            std::stop_token st;
+            if constexpr (requires {
+                { renv.query(stdexec::get_stop_token_t{}) }
+                    -> std::convertible_to<std::stop_token>; })
+            {
+                st = renv.query(
+                    stdexec::get_stop_token_t{});
+            }
+
+            env_ = io_env{ex_, st, nullptr};
+
+            if(aw_.await_ready())
+            {
+                complete();
+                return;
+            }
+
+            cb_.resume = &on_resume;
+            cb_.destroy = &on_destroy;
+            cb_.data = this;
+
+            auto h = std::coroutine_handle<>::from_address(
+                static_cast<void*>(&cb_));
+
+            auto resumed = detail::call_await_suspend(
+                &aw_, h, &env_);
+            if(resumed == h)
+                complete();
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(aw_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(aw_, std::move(rcvr));
+    }
+
+    // Bypass stdexec's sender_awaitable when co_awaited
+    // from a coroutine that provides get_io_executor or
+    // a start scheduler with get_io_executor. Adapts the
+    // IoAwaitable's 2-arg await_suspend to the standard
+    // 1-arg protocol.
+    template<class Promise>
+    auto as_awaitable(Promise& promise) &&
+    {
+        auto penv = promise.get_env();
+        auto ex = detail::resolve_executor(penv);
+
+        std::stop_token st;
+        if constexpr (requires {
+            { penv.query(stdexec::get_stop_token_t{}) }
+                -> std::convertible_to<std::stop_token>; })
+        {
+            st = penv.query(
+                stdexec::get_stop_token_t{});
+        }
+
+        using executor_type = decltype(ex);
+
+        struct aw
+        {
+            IoAw aw_;
+            executor_type ex_;
+            std::stop_token st_;
+            io_env env_;
+
+            bool await_ready() noexcept
+            {
+                return aw_.await_ready();
+            }
+
+            std::coroutine_handle<>
+            await_suspend(std::coroutine_handle<> h)
+            {
+                env_ = io_env{ex_, st_, nullptr};
+                return aw_.await_suspend(h, &env_);
+            }
+
+            auto await_resume()
+            {
+                return aw_.await_resume();
+            }
+        };
+
+        return aw{std::move(aw_), std::move(ex), st, {}};
+    }
+};
+
+/** Create a stdexec sender from an IoAwaitable.
+
+    The bridge routes the awaitable's result through sender
+    channels based on its type:
+
+    - `void` - calls `set_value()`.
+    - `error_code` (or a single-element tuple-like whose
+      element 0 is `error_code`) - calls `set_value()`
+      when the code is zero, `set_error(ec)` otherwise.
+    - Any other single value `T` - calls `set_value(T)`.
+    - Compound results whose element 0 is `error_code`
+      with additional elements are rejected at compile
+      time. Wrap the operation in a `task<error_code>`
+      that inspects the compound result and returns the
+      error code.
+
+    When connected or co_awaited, the bridge queries the
+    receiver's or promise's environment for a Capy executor
+    via get_io_executor. The environment must answer this
+    query with an object satisfying Capy's Executor concept.
+
+    @param aw The IoAwaitable to wrap.
+    @return A sender whose completion channels reflect
+        the awaitable's result type.
+*/
+template<class IoAw>
+auto as_sender(IoAw&& aw)
+{
+    return awaitable_sender<std::decay_t<IoAw>>{
+        std::forward<IoAw>(aw)};
+}
+
+// split_ec: sender adapter that routes error_code to
+// set_value() or set_error(ec) at runtime.
+
+namespace detail {
+
+template<class Sender>
+struct split_ec_sender
+{
+    using sender_concept = stdexec::sender_tag;
+
+    using completion_signatures =
+        stdexec::completion_signatures<
+            stdexec::set_value_t(),
+            stdexec::set_error_t(std::error_code),
+            stdexec::set_error_t(std::exception_ptr),
+            stdexec::set_stopped_t()>;
+
+    Sender sndr_;
+
+    template<class Receiver>
+    struct ec_receiver
+    {
+        using receiver_concept = stdexec::receiver_tag;
+
+        Receiver rcvr_;
+
+        auto get_env() const noexcept
+        {
+            return stdexec::get_env(rcvr_);
+        }
+
+        void set_value(std::error_code ec) && noexcept
+        {
+            if (!ec)
+                stdexec::set_value(
+                    std::move(rcvr_));
+            else
+                stdexec::set_error(
+                    std::move(rcvr_), ec);
+        }
+
+        void set_value() && noexcept
+        {
+            stdexec::set_value(
+                std::move(rcvr_));
+        }
+
+        template<class E>
+        void set_error(E&& e) && noexcept
+        {
+            stdexec::set_error(
+                std::move(rcvr_),
+                std::forward<E>(e));
+        }
+
+        void set_stopped() && noexcept
+        {
+            stdexec::set_stopped(
+                std::move(rcvr_));
+        }
+    };
+
+    template<class Receiver>
+    struct op_state
+    {
+        using operation_state_concept =
+            stdexec::operation_state_tag;
+
+        using inner_op_t = decltype(
+            stdexec::connect(
+                std::declval<Sender>(),
+                std::declval<ec_receiver<Receiver>>()));
+
+        inner_op_t op_;
+
+        op_state(Sender sndr, Receiver rcvr)
+            : op_(stdexec::connect(
+                std::move(sndr),
+                ec_receiver<Receiver>{std::move(rcvr)}))
+        {
+        }
+
+        op_state(op_state const&) = delete;
+        op_state(op_state&&) = delete;
+        op_state& operator=(op_state const&) = delete;
+        op_state& operator=(op_state&&) = delete;
+
+        void start() noexcept
+        {
+            stdexec::start(op_);
+        }
+    };
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) &&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            std::move(sndr_), std::move(rcvr));
+    }
+
+    template<class Receiver>
+    auto connect(Receiver rcvr) const&
+        -> op_state<Receiver>
+    {
+        return op_state<Receiver>(
+            sndr_, std::move(rcvr));
+    }
+};
+
+} // namespace detail
+
+/** Split an `error_code` value channel into success and error channels.
+
+    Takes a sender that completes with `set_value(error_code)` and
+    routes it at runtime: `set_value()` when the code is zero,
+    `set_error(ec)` otherwise. No exceptions.
+
+    @param sndr The predecessor sender.
+    @return A sender completing with `set_value()`,
+        `set_error(error_code)`, or `set_stopped()`.
+*/
+template<class Sender>
+auto split_ec(Sender&& sndr)
+{
+    return detail::split_ec_sender<
+        std::decay_t<Sender>>{
+            std::forward<Sender>(sndr)};
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/example/cuda/pipeline/cuda_pipeline.cu b/example/cuda/pipeline/cuda_pipeline.cu
new file mode 100644
index 000000000..1071bd340
--- /dev/null
+++ b/example/cuda/pipeline/cuda_pipeline.cu
@@ -0,0 +1,374 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Scene 1 (Direction 1): a capy coroutine awaits a sender whose
+// terminal action is a real CUDA __global__ kernel scheduled on
+// nvexec::stream_scheduler.
+//
+// Scene 2 (Direction 2): a capy IoAwaitable (capy::read over a
+// deterministic in-process stream pair) is exposed as a stdexec
+// sender, then composed with stdexec::upon_error, and consumed
+// via stdexec::sync_wait. Both the happy path and an injected-eof
+// path are exercised.
+//
+
+#include "awaitable_sender.hpp"
+#include "sender_awaitable.hpp"
+
+#include <boost/capy.hpp>
+#include <boost/capy/io/any_read_source.hpp>
+#include <boost/capy/io/any_write_sink.hpp>
+#include <boost/capy/test/stream.hpp>
+
+#include <stdexec/execution.hpp>
+#include <exec/static_thread_pool.hpp>
+#include <nvexec/stream_context.cuh>
+
+#include <array>
+#include <cassert>
+#include <cstddef>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <latch>
+#include <string_view>
+#include <system_error>
+#include <thread>
+#include <utility>
+
+#include <cuda_runtime.h>
+
+namespace capy = boost::capy;
+namespace ex   = stdexec;
+
+namespace {
+
+void cuda_check(cudaError_t e, char const* where)
+{
+    if (e != cudaSuccess)
+    {
+        std::cerr << where << ": " << cudaGetErrorString(e) << '\n';
+        std::abort();
+    }
+}
+
+// Scene 1: capy coroutine awaits a nvexec-scheduled SAXPY kernel.
+// Returns the host-side value at y[0] after kernel completion.
+//
+// Pipeline:
+//   just(N, a, x, y)
+//     | continues_on(gpu)                  switch onto nvexec stream
+//     | nvexec::launch(<<<grid, block>>>)  __global__ kernel on stream
+//     | continues_on(cpu)                  transfer completion back to host
+//
+// The trailing continues_on(cpu) is load-bearing: the as-written
+// nvexec adapters complete on device, but the bridge's
+// bridge_receiver is host-only. The host hop must happen before the
+// bridge connects.
+capy::task<float>
+scene1(nvexec::stream_scheduler gpu,
+       stdexec::scheduler auto cpu)
+{
+    constexpr int   N           = 1 << 16;
+    constexpr int   BLOCK       = 256;
+    constexpr int   GRID        = (N + BLOCK - 1) / BLOCK;
+    constexpr float a           = 3.0f;
+
+    float* d_x = nullptr;
+    float* d_y = nullptr;
+    cuda_check(cudaMalloc(&d_x, N * sizeof(float)), "cudaMalloc x");
+    cuda_check(cudaMalloc(&d_y, N * sizeof(float)), "cudaMalloc y");
+
+    std::vector<float> h_x(N, 1.0f);
+    std::vector<float> h_y(N, 2.0f);
+    cuda_check(cudaMemcpy(d_x, h_x.data(),
+        N * sizeof(float), cudaMemcpyHostToDevice), "H2D x");
+    cuda_check(cudaMemcpy(d_y, h_y.data(),
+        N * sizeof(float), cudaMemcpyHostToDevice), "H2D y");
+
+    auto const enter_tid = std::this_thread::get_id();
+    std::cout
+        << "  scene1: pre-await on thread "
+        << enter_tid << '\n';
+
+    co_await capy::await_sender(
+        ex::just(N, a, d_x, d_y)
+        | ex::continues_on(gpu)
+        | nvexec::launch({.grid_size = GRID, .block_size = BLOCK},
+            [] (cudaStream_t, int n, float k, float const* x, float* y) {
+                int i = blockIdx.x * blockDim.x + threadIdx.x;
+                if (i < n)
+                    y[i] = k * x[i] + y[i];
+            })
+        | ex::continues_on(cpu));
+
+    auto const resume_tid = std::this_thread::get_id();
+    std::cout
+        << "  scene1: post-await on thread "
+        << resume_tid << '\n';
+
+    // The resume thread is a capy worker that never touched CUDA; it
+    // has no current context. cudaSetDevice establishes one before
+    // the cleanup calls run.
+    cuda_check(cudaSetDevice(0), "cudaSetDevice");
+
+    float h_y0 = 0.0f;
+    cuda_check(cudaMemcpy(&h_y0, d_y,
+        sizeof(float), cudaMemcpyDeviceToHost), "D2H y[0]");
+
+    cuda_check(cudaFree(d_x), "cudaFree x");
+    cuda_check(cudaFree(d_y), "cudaFree y");
+
+    co_return h_y0;
+}
+
+// Scene 3 (P4251R0): the inference-handler shape. Network I/O uses
+// type-erased coroutine streams (any_read_source / any_write_sink); GPU
+// dispatch uses a sender bridged with await_sender. The paper's
+// listing runs a host run_model() under a device-side then(), which does
+// not compile on nvexec; this mirrors Scene 1 instead, dispatching a real
+// kernel and hopping continues_on(cpu) before the host-only bridge.
+[[maybe_unused]] capy::task<>
+handle_request(
+    capy::any_read_source& client,
+    capy::any_write_sink& response,
+    nvexec::stream_context& gpu_ctx,
+    exec::static_thread_pool::scheduler cpu)
+{
+    // receive request (coroutine, type-erased)
+    std::array<std::byte, 4096> buf;
+    auto [ec, n] = co_await client.read_some(
+        capy::mutable_buffer(buf.data(), buf.size()));
+    if(ec)
+        co_return;
+    (void) n;
+
+    // dispatch to GPU (sender, compile-time composition)
+    auto gpu = gpu_ctx.get_scheduler();
+    constexpr int N = 64;
+    float* d_y = nullptr;
+    cuda_check(cudaMalloc(&d_y, N * sizeof(float)), "scene3 malloc");
+
+    co_await capy::await_sender(
+        ex::just(N, d_y)
+        | ex::continues_on(gpu)
+        | nvexec::launch({.grid_size = 1, .block_size = N},
+            [] (cudaStream_t, int len, float* y) {
+                int i = blockIdx.x * blockDim.x + threadIdx.x;
+                if (i < len)
+                    y[i] = static_cast<float>(i);
+            })
+        | ex::continues_on(cpu));
+
+    cuda_check(cudaSetDevice(0), "scene3 setdevice");
+    std::array<float, N> result{};
+    cuda_check(cudaMemcpy(result.data(), d_y,
+        N * sizeof(float), cudaMemcpyDeviceToHost), "scene3 D2H");
+    cuda_check(cudaFree(d_y), "scene3 free");
+
+    // send result back (coroutine, type-erased)
+    auto [wec, wn] = co_await capy::write(response,
+        capy::make_buffer(result.data(), result.size() * sizeof(float)));
+    (void) wec;
+    (void) wn;
+}
+
+// Adapter run_async-like driver: kicks off scene1 on the capy
+// thread_pool, blocks the caller until it completes, and returns
+// the result via the supplied storage.
+void
+run_scene1(capy::thread_pool& pool, float& out)
+{
+    std::latch done(1);
+    std::exception_ptr err;
+
+    auto on_ok = [&](float v) noexcept {
+        out = v;
+        done.count_down();
+    };
+    auto on_err = [&](std::exception_ptr ep) noexcept {
+        err = ep;
+        done.count_down();
+    };
+
+    nvexec::stream_context stream_ctx;
+    exec::static_thread_pool cpu_pool(1);
+    capy::run_async(
+        pool.get_executor(),
+        on_ok,
+        on_err)(scene1(
+            stream_ctx.get_scheduler(),
+            cpu_pool.get_scheduler()));
+
+    done.wait();
+    if (err)
+        std::rethrow_exception(err);
+}
+
+// Scene 2: capy::read exposed as a stdexec sender, composed with
+// stdexec::upon_error, driven by sync_wait. write_env injects the
+// capy executor that the as_sender bridge needs to drive the
+// underlying IoAwaitable.
+// stream::read_some returns a raw IoAwaitable, which the bridge
+// expects. (capy::read returns a task<io_result<size_t>>, and the
+// bridge's start() does not perform symmetric transfer to the
+// task's own handle, so wrapping a task hangs.)
+void
+scene2_happy_path(capy::thread_pool& pool)
+{
+    constexpr std::string_view payload = "payload bytes";
+
+    auto [a, b] = capy::test::make_stream_pair();
+    b.provide(payload);
+
+    char buf[64];
+    auto sndr = ex::write_env(
+        capy::as_sender(
+            a.read_some(capy::mutable_buffer(buf, sizeof buf))),
+        ex::prop{capy::get_io_executor, pool.get_executor()})
+        | ex::upon_error([](auto e) noexcept -> std::size_t {
+            if constexpr (std::is_same_v<
+                std::decay_t<decltype(e)>, std::error_code>)
+            {
+                std::cerr
+                    << "  scene2 happy: unexpected error: "
+                    << e.message() << '\n';
+            }
+            std::abort();
+        });
+
+    auto result = ex::sync_wait(std::move(sndr));
+    assert(result.has_value());
+    auto const [n] = *result;
+    assert(n == payload.size());
+    assert(std::string_view(buf, n) == payload);
+
+    std::cout
+        << "  scene2 happy: read " << n
+        << " bytes\n";
+}
+
+void
+scene2_error_path(capy::thread_pool& pool)
+{
+    auto [a, b] = capy::test::make_stream_pair();
+    b.close();
+
+    char buf[64];
+    bool fired = false;
+    std::error_code observed;
+
+    auto sndr = ex::write_env(
+        capy::as_sender(
+            a.read_some(capy::mutable_buffer(buf, sizeof buf))),
+        ex::prop{capy::get_io_executor, pool.get_executor()})
+        | ex::upon_error([&](auto e) noexcept -> std::size_t {
+            if constexpr (std::is_same_v<
+                std::decay_t<decltype(e)>, std::error_code>)
+            {
+                fired = true;
+                observed = e;
+            }
+            return 0;
+        });
+
+    auto result = ex::sync_wait(std::move(sndr));
+    assert(result.has_value());
+    auto const [n] = *result;
+
+    assert(fired);
+    assert(observed);
+    std::cout
+        << "  scene2 error: upon_error fired with \""
+        << observed.message() << "\" (n=" << n << ")\n";
+}
+
+} // namespace
+
+// Minimal "send a value through the GPU, get it back" coroutine.
+// Sanity check that the smallest plausible shape compiles and runs.
+namespace mini {
+
+capy::task<int>
+gpu_add_one(int input,
+            nvexec::stream_scheduler gpu,
+            stdexec::scheduler auto cpu)
+{
+    int* d_out = nullptr;
+    cudaMalloc(&d_out, sizeof(int));
+
+    co_await capy::await_sender(
+        ex::just(input, d_out)
+        | ex::continues_on(gpu)
+        | nvexec::launch({.grid_size = 1, .block_size = 1},
+            [](cudaStream_t, int x, int* y) { *y = x + 1; })
+        | ex::continues_on(cpu));
+
+    cudaSetDevice(0);
+    int h_out;
+    cudaMemcpy(&h_out, d_out, sizeof(int),
+        cudaMemcpyDeviceToHost);
+    cudaFree(d_out);
+    co_return h_out;
+}
+
+void
+run(capy::thread_pool& pool, int input, int& out)
+{
+    std::latch done(1);
+    std::exception_ptr err;
+
+    nvexec::stream_context stream_ctx;
+    exec::static_thread_pool cpu_pool(1);
+    capy::run_async(
+        pool.get_executor(),
+        [&](int v) noexcept { out = v; done.count_down(); },
+        [&](std::exception_ptr ep) noexcept {
+            err = ep; done.count_down(); })(
+        gpu_add_one(input,
+            stream_ctx.get_scheduler(),
+            cpu_pool.get_scheduler()));
+
+    done.wait();
+    if (err) std::rethrow_exception(err);
+}
+
+} // namespace mini
+
+int main()
+{
+    std::cout
+        << "main thread: "
+        << std::this_thread::get_id() << '\n';
+
+    capy::thread_pool pool;
+
+    std::cout << "--- scene 0: minimal gpu_add_one ---\n";
+    int out = 0;
+    mini::run(pool, 41, out);
+    std::cout << "  scene 0: 41 + 1 -> " << out << '\n';
+    assert(out == 42);
+
+    std::cout << "--- scene 1: await_sender( gpu sender ) ---\n";
+    float y0 = 0.0f;
+    run_scene1(pool, y0);
+    std::cout << "  scene1: y[0] = " << y0 << '\n';
+    // a*x + y = 3*1 + 2 = 5
+    assert(y0 == 5.0f);
+
+    std::cout << "--- scene 2a: as_sender( read_some ) happy ---\n";
+    scene2_happy_path(pool);
+
+    std::cout << "--- scene 2b: as_sender( read_some ) error ---\n";
+    scene2_error_path(pool);
+
+    std::cout << "all scenes passed\n";
+    return 0;
+}
diff --git a/example/cuda/pipeline/sender_awaitable.hpp b/example/cuda/pipeline/sender_awaitable.hpp
new file mode 100644
index 000000000..c27ac505d
--- /dev/null
+++ b/example/cuda/pipeline/sender_awaitable.hpp
@@ -0,0 +1,429 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BENCH_STDEXEC_SENDER_AWAITABLE_HPP
+#define BOOST_CAPY_BENCH_STDEXEC_SENDER_AWAITABLE_HPP
+
+#include <boost/capy/error.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <stdexec/execution.hpp>
+
+#include <atomic>
+#include <coroutine>
+#include <exception>
+#include <new>
+#include <stop_token>
+#include <system_error>
+#include <tuple>
+#include <type_traits>
+#include <variant>
+
+namespace boost::capy {
+
+namespace detail {
+
+struct stopped_t {};
+
+struct operation_cancelled {};
+
+struct bridge_env
+{
+    std::stop_token st_;
+
+    auto query(
+        stdexec::get_stop_token_t const&)
+            const noexcept
+    {
+        return st_;
+    }
+};
+
+template<class Sender>
+using sender_single_value_t =
+    stdexec::value_types_of_t<
+        Sender,
+        bridge_env,
+        std::tuple,
+        std::type_identity_t>;
+
+// Detect whether a sender can complete with
+// set_error(std::error_code).
+template<class Sender>
+struct has_error_code_completion
+{
+    template<class... Es>
+    struct checker
+    {
+        static constexpr bool value =
+            (std::is_same_v<
+                Es, std::error_code> || ...);
+    };
+
+    static constexpr bool value =
+        stdexec::error_types_of_t<
+            Sender,
+            bridge_env,
+            checker>::value;
+};
+
+template<class Sender>
+constexpr bool has_error_code_v =
+    has_error_code_completion<Sender>::value;
+
+// Variant when sender can complete with
+// set_error(error_code): separate slot so
+// error_code is not wrapped in exception_ptr.
+template<class ValueTuple>
+using ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::error_code,
+    std::exception_ptr,
+    stopped_t>;
+
+// Variant when sender does not complete with
+// set_error(error_code).
+template<class ValueTuple>
+using no_ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::exception_ptr,
+    stopped_t>;
+
+template<class ValueTuple, bool HasEc>
+using result_variant = std::conditional_t<
+    HasEc,
+    ec_result_variant<ValueTuple>,
+    no_ec_result_variant<ValueTuple>>;
+
+// Bridge receiver that stores the sender's
+// completion result and resumes the coroutine.
+// Uses an atomic flag shared with await_suspend
+// to handle synchronous completion safely:
+// whichever side (set_value or await_suspend)
+// arrives second is responsible for resumption.
+template<class ValueTuple, bool HasEc>
+struct bridge_receiver
+{
+    using receiver_concept =
+        stdexec::receiver_t;
+
+    result_variant<ValueTuple, HasEc>* result_;
+    std::coroutine_handle<>            cont_;
+    std::stop_token                    st_;
+    std::atomic<bool>*                 done_;
+
+    auto get_env() const noexcept -> bridge_env
+    {
+        return {st_};
+    }
+
+    void resume_if_ready() noexcept
+    {
+        if(done_->exchange(
+            true, std::memory_order_acq_rel))
+            cont_.resume();
+    }
+
+    template<class... Args>
+    void set_value(Args&&... args) && noexcept
+    {
+        result_->template emplace<1>(
+            std::forward<Args>(args)...);
+        resume_if_ready();
+    }
+
+    template<class E>
+    void set_error(E&& e) && noexcept
+    {
+        if constexpr (
+            HasEc &&
+            std::is_same_v<
+                std::decay_t<E>,
+                std::error_code>)
+            result_->template emplace<2>(
+                std::forward<E>(e));
+        else if constexpr (
+            std::is_same_v<
+                std::decay_t<E>,
+                std::exception_ptr>)
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::forward<E>(e));
+        }
+        else
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::make_exception_ptr(
+                    std::forward<E>(e)));
+        }
+        resume_if_ready();
+    }
+
+    void set_stopped() && noexcept
+    {
+        constexpr auto idx = HasEc ? 4 : 3;
+        result_->template emplace<idx>(
+            stopped_t{});
+        resume_if_ready();
+    }
+};
+
+} // namespace detail
+
+/** Awaitable that bridges a stdexec sender
+    into a Capy coroutine.
+
+    Satisfies IoAwaitable. When co_awaited inside
+    a capy::task, connects the sender to a bridge
+    receiver, starts the operation, and resumes
+    the coroutine when the sender completes.
+
+    Uses an atomic exchange protocol to handle
+    senders that complete synchronously during
+    start(): whichever side arrives second
+    (receiver or await_suspend) resumes the
+    coroutine.
+
+    The bridge inspects the sender's error
+    completion signatures at compile time. If the
+    sender can complete with
+    set_error(std::error_code), await_resume
+    returns io_result so the error code is a
+    value, not an exception. Otherwise
+    await_resume returns the value directly and
+    genuine exceptions are rethrown.
+
+    @tparam Sender The stdexec sender type.
+*/
+template<class Sender>
+struct [[nodiscard]] sender_awaitable
+{
+    static constexpr bool has_ec =
+        detail::has_error_code_v<Sender>;
+
+    using value_tuple =
+        detail::sender_single_value_t<Sender>;
+    using variant_type =
+        detail::result_variant<
+            value_tuple, has_ec>;
+    using receiver_type =
+        detail::bridge_receiver<
+            value_tuple, has_ec>;
+    using op_state_type = decltype(
+        stdexec::connect(
+            std::declval<Sender>(),
+            std::declval<receiver_type>()));
+
+    Sender sndr_;
+    variant_type result_{};
+
+    alignas(op_state_type)
+    unsigned char op_buf_[sizeof(op_state_type)];
+    bool op_constructed_ = false;
+    std::atomic<bool> done_{false};
+
+    explicit sender_awaitable(Sender sndr)
+        : sndr_(std::move(sndr))
+    {
+    }
+
+    sender_awaitable(sender_awaitable&& o)
+        noexcept(
+            std::is_nothrow_move_constructible_v<
+                Sender>)
+        : sndr_(std::move(o.sndr_))
+    {
+    }
+
+    sender_awaitable(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable&&) = delete;
+
+    ~sender_awaitable()
+    {
+        if(op_constructed_)
+            std::launder(
+                reinterpret_cast<op_state_type*>(
+                    op_buf_))->~op_state_type();
+    }
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(
+        std::coroutine_handle<> h,
+        io_env const* env)
+    {
+        ::new(op_buf_) op_state_type(
+            stdexec::connect(
+                std::move(sndr_),
+                receiver_type{
+                    &result_, h,
+                    env->stop_token, &done_}));
+        op_constructed_ = true;
+        stdexec::start(
+            *std::launder(
+                reinterpret_cast<
+                    op_state_type*>(
+                        op_buf_)));
+
+        // If the sender completed during start(),
+        // the receiver already stored the result.
+        // Return h to resume without suspending.
+        if(done_.exchange(
+            true, std::memory_order_acq_rel))
+            return h;
+        return std::noop_coroutine();
+    }
+
+    auto await_resume()
+    {
+        if constexpr (has_ec)
+            return await_resume_ec();
+        else
+            return await_resume_no_ec();
+    }
+
+private:
+    // Sender can complete with
+    // set_error(error_code). Return io_result
+    // so the error code is a value, not an
+    // exception.
+    auto await_resume_ec()
+    {
+        // exception_ptr at index 3
+        if(result_.index() == 3)
+            std::rethrow_exception(
+                std::get<3>(result_));
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+        {
+            // stopped at index 4
+            if(result_.index() == 4)
+                return io_result<>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<>{
+                    std::get<2>(result_)};
+            return io_result<>{};
+        }
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+        {
+            using T = std::tuple_element_t<
+                0, value_tuple>;
+            if(result_.index() == 4)
+                return io_result<T>{
+                    make_error_code(
+                        error::canceled), T{}};
+            if(result_.index() == 2)
+                return io_result<T>{
+                    std::get<2>(result_), T{}};
+            return io_result<T>{
+                {},
+                std::get<0>(
+                    std::get<1>(
+                        std::move(result_)))};
+        }
+        else
+        {
+            if(result_.index() == 4)
+                return io_result<value_tuple>{
+                    make_error_code(
+                        error::canceled), value_tuple{}};
+            if(result_.index() == 2)
+                return io_result<value_tuple>{
+                    std::get<2>(result_), value_tuple{}};
+            return io_result<value_tuple>{
+                {},
+                std::get<1>(
+                    std::move(result_))};
+        }
+    }
+
+    // Sender does not complete with
+    // set_error(error_code). Return the value
+    // directly; rethrow exceptions.
+    auto await_resume_no_ec()
+    {
+        // exception_ptr at index 2
+        if(result_.index() == 2)
+            std::rethrow_exception(
+                std::get<2>(result_));
+        // stopped at index 3
+        if(result_.index() == 3)
+            throw detail::operation_cancelled{};
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+            return;
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+            return std::get<0>(
+                std::get<1>(
+                    std::move(result_)));
+        else
+            return std::get<1>(
+                std::move(result_));
+    }
+};
+
+/** Create an IoAwaitable from a stdexec sender.
+
+    If the sender can complete with
+    set_error(std::error_code), the returned
+    awaitable yields io_result so the error code
+    is a value, not an exception. Otherwise the
+    awaitable yields the value directly.
+
+    @par Example
+    @code
+    capy::task<int> compute(auto sched)
+    {
+        auto result = co_await await_sender(
+            stdexec::schedule(sched)
+                | stdexec::then(
+                    [] { return 42; }));
+        co_return result;
+    }
+    @endcode
+
+    @param sndr The sender to bridge.
+    @return An IoAwaitable that can be co_awaited
+        in a capy::task.
+*/
+template<class Sender>
+auto await_sender(Sender&& sndr)
+{
+    return sender_awaitable<
+        std::decay_t<Sender>>(
+            std::forward<Sender>(sndr));
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/example/custom-dynamic-buffer/custom_dynamic_buffer.cpp b/example/custom-dynamic-buffer/custom_dynamic_buffer.cpp
index accf86cec..995ce7ae1 100644
--- a/example/custom-dynamic-buffer/custom_dynamic_buffer.cpp
+++ b/example/custom-dynamic-buffer/custom_dynamic_buffer.cpp
@@ -17,7 +17,7 @@
 #include <cassert>
 #include <cstring>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // Custom dynamic buffer with statistics tracking
 class tracked_buffer
@@ -42,9 +42,9 @@ class tracked_buffer
     // === DynamicBuffer interface ===
     
     // Consumer: readable data
-    const_buffer data() const noexcept
+    capy::const_buffer data() const noexcept
     {
-        return const_buffer(
+        return capy::const_buffer(
             storage_.data() + read_pos_,
             write_pos_ - read_pos_);
     }
@@ -66,7 +66,7 @@ class tracked_buffer
     }
     
     // Producer: prepare space for writing
-    mutable_buffer prepare(std::size_t n)
+    capy::mutable_buffer prepare(std::size_t n)
     {
         total_prepared_ += n;
         
@@ -84,7 +84,7 @@ class tracked_buffer
         if (required > storage_.size())
             storage_.resize(required);
         
-        return mutable_buffer(
+        return capy::mutable_buffer(
             storage_.data() + write_pos_,
             n);
     }
@@ -137,7 +137,7 @@ class tracked_buffer
 };
 
 // Demonstrate using the custom buffer
-task<> read_into_tracked_buffer(test::stream& stream, tracked_buffer& buffer)
+capy::task<> read_into_tracked_buffer(capy::test::stream& stream, tracked_buffer& buffer)
 {
     // Read data in chunks
     while (true)
@@ -146,7 +146,7 @@ task<> read_into_tracked_buffer(test::stream& stream, tracked_buffer& buffer)
         // ec: std::error_code, n: std::size_t
         auto [ec, n] = co_await stream.read_some(space);
         
-        if (ec == cond::eof)
+        if (ec == capy::cond::eof)
             break;
         
         if (ec)
@@ -163,7 +163,7 @@ void demo_tracked_buffer()
 {
     std::cout << "=== Tracked Buffer Demo ===\n\n";
     
-    auto [reader, writer] = test::make_stream_pair();
+    auto [reader, writer] = capy::test::make_stream_pair();
     writer.provide("Hello, ");
     writer.provide("World! ");
     writer.provide("This is a test of the custom buffer.\n");
@@ -171,7 +171,7 @@ void demo_tracked_buffer()
     
     tracked_buffer buffer;
     
-    test::run_blocking()(read_into_tracked_buffer(reader, buffer));
+    capy::test::run_blocking()(read_into_tracked_buffer(reader, buffer));
     
     std::cout << "\nFinal buffer contents: ";
     auto data = buffer.data();  // const_buffer
diff --git a/example/custom-executor/CMakeLists.txt b/example/custom-executor/CMakeLists.txt
new file mode 100644
index 000000000..a54450c00
--- /dev/null
+++ b/example/custom-executor/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2026 Mungo Gill
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
+    CMakeLists.txt
+    Jamfile)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_custom_executor ${PFILES})
+
+set_property(TARGET capy_example_custom_executor
+    PROPERTY FOLDER "examples")
+
+target_link_libraries(capy_example_custom_executor
+    Boost::capy)
diff --git a/example/custom-executor/custom_executor.cpp b/example/custom-executor/custom_executor.cpp
new file mode 100644
index 000000000..8dec6bf0c
--- /dev/null
+++ b/example/custom-executor/custom_executor.cpp
@@ -0,0 +1,160 @@
+//
+// Copyright (c) 2026 Mungo Gill
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Custom Executor Example
+//
+// Implements the Executor concept with a simple single-threaded
+// run loop, similar to a GUI event loop.  Shows that Capy is not
+// tied to thread_pool and can integrate with any scheduling system.
+//
+
+#include <boost/capy.hpp>
+#include <boost/capy/ex/frame_allocator.hpp>
+#include <iostream>
+#include <queue>
+#include <thread>
+#include <vector>
+
+namespace capy = boost::capy;
+
+// A minimal single-threaded execution context.
+// Demonstrates how to satisfy the Executor concept
+// for any custom scheduling system.
+class run_loop : public capy::execution_context
+{
+    std::queue<std::coroutine_handle<>> queue_;
+    std::thread::id owner_;
+
+public:
+    class executor_type;
+
+    run_loop()
+        : execution_context(this)
+    {
+    }
+
+    ~run_loop()
+    {
+        shutdown();
+        destroy();
+    }
+
+    run_loop(run_loop const&) = delete;
+    run_loop& operator=(run_loop const&) = delete;
+
+    // Drain the queue until empty
+    void run()
+    {
+        owner_ = std::this_thread::get_id();
+        while (!queue_.empty())
+        {
+            auto h = queue_.front();
+            queue_.pop();
+            capy::safe_resume(h);
+        }
+    }
+
+    void enqueue(std::coroutine_handle<> h)
+    {
+        queue_.push(h);
+    }
+
+    bool is_running_on_this_thread() const noexcept
+    {
+        return std::this_thread::get_id() == owner_;
+    }
+
+    executor_type get_executor() noexcept;
+};
+
+class run_loop::executor_type
+{
+    friend class run_loop;
+    run_loop* loop_ = nullptr;
+
+    explicit executor_type(run_loop& loop) noexcept
+        : loop_(&loop)
+    {
+    }
+
+public:
+    executor_type() = default;
+
+    capy::execution_context& context() const noexcept
+    {
+        return *loop_;
+    }
+
+    void on_work_started() const noexcept {}
+    void on_work_finished() const noexcept {}
+
+    std::coroutine_handle<> dispatch(
+        capy::continuation& c) const
+    {
+        if (loop_->is_running_on_this_thread())
+            return c.h;
+        loop_->enqueue(c.h);
+        return std::noop_coroutine();
+    }
+
+    void post(capy::continuation& c) const
+    {
+        loop_->enqueue(c.h);
+    }
+
+    bool operator==(executor_type const& other) const noexcept
+    {
+        return loop_ == other.loop_;
+    }
+};
+
+inline
+run_loop::executor_type
+run_loop::get_executor() noexcept
+{
+    return executor_type{*this};
+}
+
+// Verify the concept is satisfied
+static_assert(capy::Executor<run_loop::executor_type>);
+
+capy::io_task<int> compute(int x)
+{
+    std::cout << "  computing " << x << " * " << x << "\n";
+    co_return capy::io_result<int>{{}, x * x};
+}
+
+capy::task<> run_tasks()
+{
+    std::cout << "Launching 3 tasks with when_all...\n";
+
+    auto [ec, r1, r2, r3] = co_await capy::when_all(
+        compute(3), compute(7), compute(11));
+
+    std::cout << "\nResults: " << r1 << ", " << r2
+              << ", " << r3 << "\n";
+    std::cout << "Sum of squares: "
+              << r1 + r2 + r3 << "\n";
+}
+
+int main()
+{
+    run_loop loop;
+
+    // Launch using run_async, just like with thread_pool
+    capy::run_async(loop.get_executor())(run_tasks());
+
+    // Drive the loop — all coroutines execute here
+    std::cout << "Running event loop on main thread...\n";
+    loop.run();
+
+    std::cout << "Event loop finished.\n";
+    return 0;
+}
diff --git a/example/echo-server-corosio/echo_server.cpp b/example/echo-server-corosio/echo_server.cpp
index bc21ffe4c..2220397c6 100644
--- a/example/echo-server-corosio/echo_server.cpp
+++ b/example/echo-server-corosio/echo_server.cpp
@@ -7,138 +7,86 @@
 // Official repository: https://github.com/cppalliance/capy
 //
 
+//
+// Echo Server Example (Corosio)
+//
+// A complete echo server using Corosio for real network I/O.
+// Demonstrates Capy coroutines driving actual TCP connections.
+//
+
 #include <boost/capy.hpp>
 #include <boost/corosio.hpp>
 #include <iostream>
 
-using namespace boost::capy;
-namespace tcp = boost::corosio::tcp;
+namespace corosio = boost::corosio;
+namespace capy = boost::capy;
 
-// Echo handler: receives data and sends it back
-task<> echo_session(any_stream& stream, std::string client_info)
+capy::task<> echo_session(corosio::tcp_socket sock)
 {
-    std::cout << "[" << client_info << "] Session started\n";
-    
-    char buffer[1024];
-    std::size_t total_bytes = 0;
-    
-    try
-    {
-        for (;;)
-        {
-            // Read some data
-            // ec: std::error_code, n: std::size_t
-            auto [ec, n] = co_await stream.read_some(mutable_buffer(buffer));
-            
-            if (ec == cond::eof)
-            {
-                std::cout << "[" << client_info << "] Client disconnected\n";
-                break;
-            }
-            
-            if (ec)
-            {
-                std::cout << "[" << client_info << "] Read error: " 
-                          << ec.message() << "\n";
-                break;
-            }
-            
-            total_bytes += n;
-            
-            // Echo it back
-            // wec: std::error_code, wn: std::size_t
-            auto [wec, wn] = co_await write(stream, const_buffer(buffer, n));
-            
-            if (wec)
-            {
-                std::cout << "[" << client_info << "] Write error: " 
-                          << wec.message() << "\n";
-                break;
-            }
-        }
-    }
-    catch (std::exception const& e)
+    char buf[1024];
+
+    for (;;)
     {
-        std::cout << "[" << client_info << "] Exception: " << e.what() << "\n";
+        auto [ec, n] = co_await sock.read_some(
+            capy::mutable_buffer(buf, sizeof(buf)));
+
+        auto [wec, wn] = co_await capy::write(
+            sock, capy::const_buffer(buf, n));
+
+        if (ec)
+            break;
+
+        if (wec)
+            break;
     }
-    
-    std::cout << "[" << client_info << "] Session ended, "
-              << total_bytes << " bytes echoed\n";
+
+    sock.close();
 }
 
-// Accept loop: accepts connections and spawns handlers
-task<> accept_loop(tcp::acceptor& acceptor, executor_ref ex)
+capy::task<> accept_loop(
+    corosio::tcp_acceptor& acc,
+    corosio::io_context& ioc)
 {
-    std::cout << "Server listening on port " 
-              << acceptor.local_endpoint().port() << "\n";
-    
-    int connection_id = 0;
-    
+    auto ep = acc.local_endpoint();
+    std::cout << "Listening on port " << ep.port() << "\n";
+
     for (;;)
     {
-        // Accept a connection
-        // ec: std::error_code, socket: tcp::socket
-        auto [ec, socket] = co_await acceptor.async_accept();
-        
+        corosio::tcp_socket peer(ioc);
+        auto [ec] = co_await acc.accept(peer);
+
         if (ec)
         {
             std::cout << "Accept error: " << ec.message() << "\n";
             continue;
         }
-        
-        // Build client info string
-        auto remote = socket.remote_endpoint();  // tcp::endpoint
-        std::string client_info = 
-            std::to_string(++connection_id) + ":" +
-            remote.address().to_string() + ":" +
-            std::to_string(remote.port());
-        
-        std::cout << "[" << client_info << "] Connection accepted\n";
-        
-        // Wrap socket and spawn handler
-        // Note: socket ownership transfers to the lambda
-        run_async(ex)(
-            [](tcp::socket sock, std::string info) -> task<> {
-                any_stream stream{sock};
-                co_await echo_session(stream, std::move(info));
-            }(std::move(socket), std::move(client_info))
-        );
+
+        auto remote = peer.remote_endpoint();
+        std::cout << "Connection from ";
+        if (remote.is_v4())
+            std::cout << remote.v4_address();
+        else
+            std::cout << remote.v6_address();
+        std::cout << ":" << remote.port() << "\n";
+
+        capy::run_async(ioc.get_executor())(
+            echo_session(std::move(peer)));
     }
 }
 
 int main(int argc, char* argv[])
 {
-    try
-    {
-        // Parse port from command line
-        unsigned short port = 8080;
-        if (argc > 1)
-            port = static_cast<unsigned short>(std::stoi(argv[1]));
-        
-        // Create I/O context and thread pool
-        boost::corosio::io_context ioc;
-        thread_pool pool(4);
-        
-        // Create acceptor
-        tcp::endpoint endpoint(tcp::v4(), port);
-        tcp::acceptor acceptor(ioc, endpoint);
-        acceptor.set_option(tcp::acceptor::reuse_address(true));
-        
-        std::cout << "Starting echo server...\n";
-        
-        // Run accept loop
-        run_async(pool.get_executor())(
-            accept_loop(acceptor, pool.get_executor())
-        );
-        
-        // Run the I/O context (this blocks)
-        ioc.run();
-    }
-    catch (std::exception const& e)
-    {
-        std::cerr << "Error: " << e.what() << "\n";
-        return 1;
-    }
-    
+    unsigned short port = 8080;
+    if (argc > 1)
+        port = static_cast<unsigned short>(std::atoi(argv[1]));
+
+    corosio::io_context ioc;
+    corosio::tcp_acceptor acc(ioc, corosio::endpoint(port));
+
+    capy::run_async(ioc.get_executor())(
+        accept_loop(acc, ioc));
+
+    ioc.run();
+
     return 0;
 }
diff --git a/example/fabrics/CMakeLists.txt b/example/fabrics/CMakeLists.txt
new file mode 100644
index 000000000..ec2ed1dd8
--- /dev/null
+++ b/example/fabrics/CMakeLists.txt
@@ -0,0 +1,69 @@
+#
+# Copyright (c) 2026 Steve Gerbino
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+# Transport-neutral, non-GPU listings from P4251R0: the byte-oriented
+# compound-result pattern (capy only) and the HPC-fabric send signatures.
+# No CUDA.
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS
+    *.cpp *.hpp
+    CMakeLists.txt
+    README.md)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_fabrics ${PFILES})
+
+set_target_properties(capy_example_fabrics PROPERTIES FOLDER "examples")
+
+target_compile_features(capy_example_fabrics PRIVATE cxx_std_20)
+
+target_link_libraries(capy_example_fabrics PRIVATE Boost::capy)
+
+# HPC-fabric signature checks: each call is compiled (never run) against the real
+# library header only when that library is found, to verify the paper's
+# signature. Each library is optional and independent.
+find_path(CAPY_IBVERBS_INCLUDE_DIR infiniband/verbs.h)
+find_library(CAPY_IBVERBS_LIBRARY ibverbs)
+if(CAPY_IBVERBS_INCLUDE_DIR AND CAPY_IBVERBS_LIBRARY)
+    target_include_directories(capy_example_fabrics PRIVATE
+        ${CAPY_IBVERBS_INCLUDE_DIR})
+    target_link_libraries(capy_example_fabrics PRIVATE
+        ${CAPY_IBVERBS_LIBRARY})
+    target_compile_definitions(capy_example_fabrics PRIVATE
+        CAPY_EXAMPLE_HAS_IBVERBS=1)
+    message(STATUS "fabrics: libibverbs found; verifying ibv_post_send")
+endif()
+
+find_path(CAPY_LIBFABRIC_INCLUDE_DIR rdma/fabric.h)
+find_library(CAPY_LIBFABRIC_LIBRARY fabric)
+if(CAPY_LIBFABRIC_INCLUDE_DIR AND CAPY_LIBFABRIC_LIBRARY)
+    target_include_directories(capy_example_fabrics PRIVATE
+        ${CAPY_LIBFABRIC_INCLUDE_DIR})
+    target_link_libraries(capy_example_fabrics PRIVATE
+        ${CAPY_LIBFABRIC_LIBRARY})
+    target_compile_definitions(capy_example_fabrics PRIVATE
+        CAPY_EXAMPLE_HAS_LIBFABRIC=1)
+    message(STATUS "fabrics: libfabric found; verifying fi_send")
+endif()
+
+# UCX ships in the official 'openucx' package; it is also bundled in the
+# NVIDIA HPC SDK. Pass -DCAPY_UCX_INCLUDE_DIR/-DCAPY_UCX_LIBRARY to point at
+# a non-standard location.
+find_path(CAPY_UCX_INCLUDE_DIR ucp/api/ucp.h)
+find_library(CAPY_UCX_LIBRARY ucp)
+if(CAPY_UCX_INCLUDE_DIR AND CAPY_UCX_LIBRARY)
+    target_include_directories(capy_example_fabrics PRIVATE
+        ${CAPY_UCX_INCLUDE_DIR})
+    target_link_libraries(capy_example_fabrics PRIVATE
+        ${CAPY_UCX_LIBRARY})
+    target_compile_definitions(capy_example_fabrics PRIVATE
+        CAPY_EXAMPLE_HAS_UCX=1)
+    message(STATUS "fabrics: UCX found; verifying ucp_tag_send_nbx")
+endif()
diff --git a/example/fabrics/README.md b/example/fabrics/README.md
new file mode 100644
index 000000000..d1fda9476
--- /dev/null
+++ b/example/fabrics/README.md
@@ -0,0 +1,43 @@
+# Fabrics example (P4251R0)
+
+The transport-neutral, non-GPU listings from P4251R0 "IoAwaitables for GPU
+Data Movement". Validation that the paper's byte-oriented and
+HPC-fabric calls are type-correct against the real `boost::capy` API and the
+installed fabric libraries. Nothing here is executed; a clean build is the
+deliverable.
+
+Unlike the `cuda/` examples, this needs **no CUDA toolchain**: only a
+C++20 compiler and `boost::capy`, plus whichever fabric libraries happen to
+be installed.
+
+What is validated:
+
+- `read_with_reset`: `read_some` delivers `(error_code, n)` via structured
+  bindings; the coroutine branches on a partial-read condition with no
+  sender channel to choose. Pure capy, no transport library.
+- HPC-fabric send signatures, each built only when its library is found:
+  - libibverbs `ibv_post_send` (RDMA / InfiniBand)
+  - libfabric `fi_send` (OFI completion-queue model)
+  - UCX `ucp_tag_send_nbx` (progress-engine callback model)
+
+NCCL and NVSHMEM are the GPU members of the paper's fabric list; NCCL is
+exercised by the `cuda/datamovement` example, and NVSHMEM's device API does
+not compile under clang-cuda (see that example's notes).
+
+## Building
+
+This builds as part of the normal example set (`BOOST_CAPY_BUILD_EXAMPLES`).
+The fabric checks activate automatically when their libraries are present:
+
+- libibverbs: `libibverbs` package.
+- libfabric: `libfabric` package.
+- UCX: `openucx` package, or pass
+  `-DCAPY_UCX_INCLUDE_DIR=<dir> -DCAPY_UCX_LIBRARY=<libucp.so>` to point at a
+  non-standard location (for example the UCX bundled in the NVIDIA HPC SDK).
+
+```
+cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_STANDARD=20
+cmake --build build --config Release --target capy_example_fabrics
+```
+
+A clean build is the pass condition; the binary need not be run.
diff --git a/example/fabrics/fabrics.cpp b/example/fabrics/fabrics.cpp
new file mode 100644
index 000000000..258fadb74
--- /dev/null
+++ b/example/fabrics/fabrics.cpp
@@ -0,0 +1,106 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// The transport-neutral, non-GPU listings from P4251R0: the byte-oriented
+// compound-result pattern (capy only) and the HPC-fabric send-call
+// signatures (libibverbs / libfabric / UCX). Nothing is executed; the
+// build itself is the check. Each fabric block builds only when found.
+
+#include <boost/capy.hpp>
+#include <boost/capy/test/stream.hpp>
+
+#include <cstddef>
+#include <system_error>
+
+namespace capy = boost::capy;
+
+namespace {
+
+// A byte-oriented read delivers (error_code, n) via structured bindings;
+// the coroutine branches on a partial-read condition (the peer reset after
+// n bytes arrived) with no sender channel to choose. The same compound
+// result is what RDMA work completions carry.
+[[maybe_unused]] capy::task<>
+read_with_reset(capy::test::stream& s)
+{
+    std::byte buf[64];
+    auto [ec, n] = co_await s.read_some(
+        capy::mutable_buffer(buf, sizeof buf));
+    if(ec == std::errc::connection_reset)
+    {
+        // 'n' bytes arrived before the reset.
+        (void) n;
+        co_return;
+    }
+    (void) n;
+}
+
+} // namespace
+
+#if defined(CAPY_EXAMPLE_HAS_IBVERBS)
+#include <infiniband/verbs.h>
+
+namespace {
+
+// libibverbs: completion via a completion-channel file descriptor.
+[[maybe_unused]] void
+sig_ibverbs()
+{
+    ibv_qp* qp = nullptr;
+    ibv_send_wr wr{};
+    ibv_send_wr* bad_wr = nullptr;
+    (void) ibv_post_send(qp, &wr, &bad_wr);
+}
+
+} // namespace
+#endif
+
+#if defined(CAPY_EXAMPLE_HAS_LIBFABRIC)
+#include <rdma/fi_endpoint.h>
+
+namespace {
+
+// libfabric: completion via a completion-queue poll.
+[[maybe_unused]] void
+sig_libfabric()
+{
+    fid_ep* ep = nullptr;
+    char buffer[16];
+    fi_addr_t dest_addr = 0;
+    void* context = nullptr;
+    (void) fi_send(ep, buffer, sizeof buffer, nullptr, dest_addr, context);
+}
+
+} // namespace
+#endif
+
+#if defined(CAPY_EXAMPLE_HAS_UCX)
+#include <ucp/api/ucp.h>
+
+namespace {
+
+// UCX: completion via a callback from the progress engine.
+[[maybe_unused]] void
+sig_ucx()
+{
+    ucp_ep_h ep = nullptr;
+    char buffer[16];
+    ucp_tag_t tag = 0;
+    ucp_request_param_t param{};
+    (void) ucp_tag_send_nbx(ep, buffer, sizeof buffer, tag, &param);
+}
+
+} // namespace
+#endif
+
+// The target exists to prove the listings are type-correct; it is not run.
+int main()
+{
+    return 0;
+}
diff --git a/example/hello-task/hello_task.cpp b/example/hello-task/hello_task.cpp
index e6aa44ced..843ca46cc 100644
--- a/example/hello-task/hello_task.cpp
+++ b/example/hello-task/hello_task.cpp
@@ -10,9 +10,9 @@
 #include <boost/capy.hpp>
 #include <iostream>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
-task<> say_hello()
+capy::task<> say_hello()
 {
     std::cout << "Hello from Capy!\n";
     co_return;
@@ -20,7 +20,7 @@ task<> say_hello()
 
 int main()
 {
-    thread_pool pool;
-    run_async(pool.get_executor())(say_hello());
+    capy::thread_pool pool;
+    capy::run_async(pool.get_executor())(say_hello());
     return 0;
 }
diff --git a/example/mock-stream-testing/mock_stream_testing.cpp b/example/mock-stream-testing/mock_stream_testing.cpp
index 89b4bb588..8f32d969d 100644
--- a/example/mock-stream-testing/mock_stream_testing.cpp
+++ b/example/mock-stream-testing/mock_stream_testing.cpp
@@ -16,32 +16,33 @@
 #include <cassert>
 #include <cctype>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // A simple protocol: read until newline, echo back uppercase
 // Takes any_stream& so the function is transport-independent
-task<bool> echo_line_uppercase(any_stream& stream)
+capy::task<bool> echo_line_uppercase(capy::any_stream& stream)
 {
     std::string line;
     char c;
-    
+
     // Read character by character until newline
     while (true)
     {
         // ec: std::error_code, n: std::size_t
-        auto [ec, n] = co_await stream.read_some(mutable_buffer(&c, 1));
-        
-        if (ec)
+        auto [ec, n] = co_await stream.read_some(capy::mutable_buffer(&c, 1));
+
+        if (n > 0)
         {
-            if (ec == cond::eof)
+            if (c == '\n')
                 break;
-            co_return false;
+            line += static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
         }
-        
-        if (c == '\n')
+
+        if (ec == capy::cond::eof)
             break;
-        
-        line += static_cast<char>(std::toupper(static_cast<unsigned char>(c)));
+
+        if (ec)
+            co_return false;
     }
     
     line += '\n';
@@ -52,12 +53,12 @@ task<bool> echo_line_uppercase(any_stream& stream)
     {
         // wec: std::error_code, wn: std::size_t
         auto [wec, wn] = co_await stream.write_some(
-            const_buffer(line.data() + written, line.size() - written));
-        
+            capy::const_buffer(line.data() + written, line.size() - written));
+
+        written += wn;
+
         if (wec)
             co_return false;
-        
-        written += wn;
     }
     
     co_return true;
@@ -67,13 +68,13 @@ void test_happy_path()
 {
     std::cout << "Test: happy path\n";
     
-    auto [a, b] = test::make_stream_pair();
+    auto [a, b] = capy::test::make_stream_pair();
     b.provide("hello\n");
-    
-    any_stream stream{&a};  // any_stream
-    
+
+    capy::any_stream stream{&a};  // any_stream
+
     bool result = false;  // bool
-    test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
+    capy::test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
     
     assert(result == true);
     assert(b.data() == "HELLO\n");
@@ -85,14 +86,14 @@ void test_partial_reads()
 {
     std::cout << "Test: partial reads (1 byte at a time)\n";
     
-    auto [a, b] = test::make_stream_pair();
+    auto [a, b] = capy::test::make_stream_pair();
     a.set_max_read_size(1);
     b.provide("hi\n");
-    
-    any_stream stream{&a};  // any_stream
-    
+
+    capy::any_stream stream{&a};  // any_stream
+
     bool result = false;  // bool
-    test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
+    capy::test::run_blocking([&](bool r) { result = r; })(echo_line_uppercase(stream));
     
     assert(result == true);
     assert(b.data() == "HI\n");
@@ -109,12 +110,12 @@ void test_with_error_injection()
     
     // fuse::armed runs the test repeatedly, failing at each
     // operation point until all paths are covered
-    test::fuse f;  // test::fuse
-    auto r = f.armed([&](test::fuse&) -> task<> {  // fuse::result
-        auto [a, b] = test::make_stream_pair(f);
+    capy::test::fuse f;  // test::fuse
+    auto r = f.armed([&](capy::test::fuse&) -> capy::task<> {  // fuse::result
+        auto [a, b] = capy::test::make_stream_pair(f);
         b.provide("test\n");
         
-        any_stream stream{&a};  // any_stream
+        capy::any_stream stream{&a};  // any_stream
         
         // Run the protocol - fuse will inject errors at each step
         bool result = co_await echo_line_uppercase(stream);  // bool
diff --git a/example/parallel-fetch/parallel_fetch.cpp b/example/parallel-fetch/parallel_fetch.cpp
index 3528714d1..34885d0cf 100644
--- a/example/parallel-fetch/parallel_fetch.cpp
+++ b/example/parallel-fetch/parallel_fetch.cpp
@@ -11,37 +11,38 @@
 #include <iostream>
 #include <latch>
 #include <string>
+#include <vector>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // Simulated async operations
-task<int> fetch_user_id(std::string username)
+capy::task<int> fetch_user_id(std::string username)
 {
     std::cout << "Fetching user ID for: " << username << "\n";
     // In real code: co_await http_get("/users/" + username);
     co_return static_cast<int>(username.length()) * 100;  // Fake ID
 }
 
-task<std::string> fetch_user_name(int id)
+capy::task<std::string> fetch_user_name(int id)
 {
     std::cout << "Fetching name for user ID: " << id << "\n";
     co_return "User" + std::to_string(id);
 }
 
-task<int> fetch_order_count(int user_id)
+capy::task<int> fetch_order_count(int user_id)
 {
     std::cout << "Fetching order count for user: " << user_id << "\n";
     co_return user_id / 10;  // Fake count
 }
 
-task<double> fetch_account_balance(int user_id)
+capy::task<double> fetch_account_balance(int user_id)
 {
     std::cout << "Fetching balance for user: " << user_id << "\n";
     co_return user_id * 1.5;  // Fake balance
 }
 
 // Fetch all user data in parallel
-task<> fetch_user_dashboard(std::string username)
+capy::task<> fetch_user_dashboard(std::string username)
 {
     std::cout << "\n=== Fetching dashboard for: " << username << " ===\n";
     
@@ -49,15 +50,20 @@ task<> fetch_user_dashboard(std::string username)
     int user_id = co_await fetch_user_id(username);
     std::cout << "Got user ID: " << user_id << "\n\n";
     
-    // Now fetch all user data in parallel
+    // Fetch all user data in parallel using variadic when_all.
+    // Heterogeneous return types are flattened into the result.
     std::cout << "Starting parallel fetches...\n";
-    // name: std::string, orders: int, balance: double
-    auto [name, orders, balance] = co_await when_all(
-        fetch_user_name(user_id),
-        fetch_order_count(user_id),
-        fetch_account_balance(user_id)
-    );
-    
+
+    auto wrap = [](auto inner) -> capy::io_task<decltype(inner.await_resume())> {
+        co_return capy::io_result<decltype(inner.await_resume())>{
+            {}, co_await std::move(inner)};
+    };
+
+    auto [ec, name, orders, balance] = co_await capy::when_all(
+        wrap(fetch_user_name(user_id)),
+        wrap(fetch_order_count(user_id)),
+        wrap(fetch_account_balance(user_id)));
+
     std::cout << "\nDashboard results:\n";
     std::cout << "  Name: " << name << "\n";
     std::cout << "  Orders: " << orders << "\n";
@@ -65,61 +71,60 @@ task<> fetch_user_dashboard(std::string username)
 }
 
 // Example with void tasks
-task<> log_access(std::string resource)
+capy::io_task<> log_access(std::string resource)
 {
     std::cout << "Logging access to: " << resource << "\n";
-    co_return;
+    co_return capy::io_result<>{};
 }
 
-task<> update_metrics(std::string metric)
+capy::io_task<> update_metrics(std::string metric)
 {
     std::cout << "Updating metric: " << metric << "\n";
-    co_return;
+    co_return capy::io_result<>{};
 }
 
-task<std::string> fetch_with_side_effects()
+capy::task<std::string> fetch_with_side_effects()
 {
     std::cout << "\n=== Fetch with side effects ===\n";
-    
-    // void tasks don't contribute to result tuple
-    std::tuple<std::string> results = co_await when_all(
-        log_access("api/data"),           // void - no result
-        update_metrics("api_calls"),      // void - no result
-        fetch_user_name(42)               // returns string
-    );
-    std::string data = std::get<0>(results);  // std::string
-    
+
+    auto r = co_await capy::when_all(
+        log_access("api/data"),
+        update_metrics("api_calls"));
+    if (r.ec)
+        co_return "error";
+
+    auto data = co_await fetch_user_name(42);
+
     std::cout << "Data: " << data << "\n";
     co_return data;
 }
 
 // Error handling example
-task<int> might_fail(bool should_fail, std::string name)
+capy::io_task<int> might_fail(bool should_fail, std::string name)
 {
     std::cout << "Task " << name << " starting\n";
-    
+
     if (should_fail)
     {
         throw std::runtime_error(name + " failed!");
     }
-    
+
     std::cout << "Task " << name << " completed\n";
-    co_return 42;
+    co_return capy::io_result<int>{{}, 42};
 }
 
-task<> demonstrate_error_handling()
+capy::task<> demonstrate_error_handling()
 {
     std::cout << "\n=== Error handling ===\n";
     
     try
     {
-        // a: int, b: int, c: int
-        auto [a, b, c] = co_await when_all(
+        auto [ec2, a, b, c] = co_await capy::when_all(
             might_fail(false, "A"),
             might_fail(true, "B"),   // This one fails
-            might_fail(false, "C")
-        );
-        std::cout << "All succeeded: " << a << ", " << b << ", " << c << "\n";
+            might_fail(false, "C"));
+        std::cout << "All succeeded: " << a << ", "
+                  << b << ", " << c << "\n";
     }
     catch (std::runtime_error const& e)
     {
@@ -131,17 +136,17 @@ task<> demonstrate_error_handling()
 
 int main()
 {
-    thread_pool pool;
+    capy::thread_pool pool;
     std::latch done(3);  // std::latch - wait for 3 tasks
-    
+
     // Completion handlers signal the latch when each task finishes
     // Use generic lambda to accept any result type (or no result for task<void>)
     auto on_complete = [&done](auto&&...) { done.count_down(); };
     auto on_error = [&done](std::exception_ptr) { done.count_down(); };
-    
-    run_async(pool.get_executor(), on_complete, on_error)(fetch_user_dashboard("alice"));
-    run_async(pool.get_executor(), on_complete, on_error)(fetch_with_side_effects());
-    run_async(pool.get_executor(), on_complete, on_error)(demonstrate_error_handling());
+
+    capy::run_async(pool.get_executor(), on_complete, on_error)(fetch_user_dashboard("alice"));
+    capy::run_async(pool.get_executor(), on_complete, on_error)(fetch_with_side_effects());
+    capy::run_async(pool.get_executor(), on_complete, on_error)(demonstrate_error_handling());
     
     done.wait();  // Block until all tasks complete
     return 0;
diff --git a/example/parallel-tasks/CMakeLists.txt b/example/parallel-tasks/CMakeLists.txt
new file mode 100644
index 000000000..d9748105c
--- /dev/null
+++ b/example/parallel-tasks/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2026 Mungo Gill
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
+    CMakeLists.txt
+    Jamfile)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_parallel_tasks ${PFILES})
+
+set_property(TARGET capy_example_parallel_tasks
+    PROPERTY FOLDER "examples")
+
+target_link_libraries(capy_example_parallel_tasks
+    Boost::capy)
diff --git a/example/parallel-tasks/parallel_tasks.cpp b/example/parallel-tasks/parallel_tasks.cpp
new file mode 100644
index 000000000..0765d97d3
--- /dev/null
+++ b/example/parallel-tasks/parallel_tasks.cpp
@@ -0,0 +1,94 @@
+//
+// Copyright (c) 2026 Mungo Gill
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Parallel Tasks Example
+//
+// Distributes CPU-bound work across a thread_pool and collects
+// results with when_all.  Each task sums a range of integers
+// and prints its thread ID to show parallel execution.
+//
+
+#include <boost/capy.hpp>
+#include <iostream>
+#include <latch>
+#include <sstream>
+#include <thread>
+#include <vector>
+
+namespace capy = boost::capy;
+
+// Sum integers in [lo, hi)
+capy::io_task<long long> partial_sum(int lo, int hi)
+{
+    std::ostringstream oss;
+    oss << "  range [" << lo << ", " << hi
+        << ") on thread " << std::this_thread::get_id() << "\n";
+    std::cout << oss.str();
+
+    long long sum = 0;
+    for (int i = lo; i < hi; ++i)
+        sum += i;
+    co_return capy::io_result<long long>{{}, sum};
+}
+
+int main()
+{
+    constexpr int total = 10000;
+    constexpr int num_tasks = 4;
+    constexpr int chunk = total / num_tasks;
+
+    capy::thread_pool pool(num_tasks);
+    std::latch done(1);
+
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr ep) {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "Error: " << e.what() << "\n";
+        }
+        catch (...) {
+            std::cerr << "Error: unknown exception\n";
+        }
+        done.count_down();
+    };
+
+    auto compute = [&]() -> capy::task<> {
+        std::cout << "Dispatching " << num_tasks
+                  << " parallel tasks...\n";
+
+        std::vector<capy::io_task<long long>> tasks;
+        for (int i = 0; i < num_tasks; ++i)
+            tasks.push_back(partial_sum(i * chunk, (i + 1) * chunk));
+
+        auto [ec, sums] = co_await capy::when_all(std::move(tasks));
+
+        long long total_sum = 0;
+        for (auto s : sums)
+            total_sum += s;
+
+        // Arithmetic series: sum [0, N) = N*(N-1)/2
+        long long expected =
+            static_cast<long long>(total) * (total - 1) / 2;
+
+        std::cout << "\nPartial sums:";
+        for (std::size_t i = 0; i < sums.size(); ++i)
+        {
+            if (i > 0) std::cout << " +";
+            std::cout << " " << sums[i];
+        }
+        std::cout << "\nTotal: " << total_sum
+                  << " (expected " << expected << ")\n";
+    };
+
+    capy::run_async(pool.get_executor(), on_complete, on_error)(compute());
+    done.wait();
+
+    return 0;
+}
diff --git a/example/producer-consumer/producer_consumer.cpp b/example/producer-consumer/producer_consumer.cpp
index 5dca0e523..eb67e9134 100644
--- a/example/producer-consumer/producer_consumer.cpp
+++ b/example/producer-consumer/producer_consumer.cpp
@@ -19,44 +19,44 @@
 #include <iostream>
 #include <latch>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 int main()
 {
-    thread_pool pool;  // thread_pool
-    strand s{pool.get_executor()};  // strand - serializes execution
-    std::latch done(1);  // std::latch - wait for completion
+    capy::thread_pool pool;
+    capy::strand s{pool.get_executor()};
+    std::latch done(1);
 
-    auto on_complete = [&done](auto&&...) { done.count_down(); };  // lambda
-    auto on_error = [&done](std::exception_ptr) { done.count_down(); };  // lambda
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr) { done.count_down(); };
 
-    async_event data_ready;  // async_event
-    int shared_value = 0;    // int
+    capy::async_event data_ready;
+    int shared_value = 0;
 
-    auto producer = [&]() -> task<> {
+    auto producer = [&]() -> capy::io_task<> {
         std::cout << "Producer: preparing data...\n";
         shared_value = 42;
         std::cout << "Producer: data ready, signaling\n";
         data_ready.set();
-        co_return;
+        co_return capy::io_result<>{};
     };
 
-    auto consumer = [&]() -> task<> {
+    auto consumer = [&]() -> capy::io_task<> {
         std::cout << "Consumer: waiting for data...\n";
         auto [ec] = co_await data_ready.wait();
         (void)ec;
         std::cout << "Consumer: received value " << shared_value << "\n";
-        co_return;
+        co_return capy::io_result<>{};
     };
 
     // Run both tasks concurrently using when_all, through a strand.
     // The strand serializes execution, ensuring thread-safe access
     // to the shared async_event and shared_value.
-    auto run_both = [&]() -> task<> {
-        co_await when_all(producer(), consumer());
+    auto run_both = [&]() -> capy::task<> {
+        (void) co_await capy::when_all(producer(), consumer());
     };
 
-    run_async(s, on_complete, on_error)(run_both());
+    capy::run_async(s, on_complete, on_error)(run_both());
 
     done.wait();  // Block until tasks complete
     return 0;
diff --git a/example/quitter-shutdown/CMakeLists.txt b/example/quitter-shutdown/CMakeLists.txt
new file mode 100644
index 000000000..14d6f164d
--- /dev/null
+++ b/example/quitter-shutdown/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2026 Michael Vandeberg
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
+    CMakeLists.txt
+    Jamfile)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_quitter_shutdown ${PFILES})
+
+set_property(TARGET capy_example_quitter_shutdown
+    PROPERTY FOLDER "examples")
+
+target_link_libraries(capy_example_quitter_shutdown
+    Boost::capy)
diff --git a/example/quitter-shutdown/quitter_shutdown.cpp b/example/quitter-shutdown/quitter_shutdown.cpp
new file mode 100644
index 000000000..ef77f6e29
--- /dev/null
+++ b/example/quitter-shutdown/quitter_shutdown.cpp
@@ -0,0 +1,165 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+/* Quitter Shutdown Example
+
+   Demonstrates quitter<T> for responsive application shutdown.
+
+   Four workers simulate a batch file-processing pipeline: each
+   "downloads" data (delay), "transforms" it, and "writes" the
+   result (delay).  Workers are quitter<> coroutines — their
+   bodies contain zero cancellation-handling code.
+
+   Press Ctrl+C to request shutdown.  Every in-flight worker
+   exits at its next co_await, RAII cleanup runs (each worker
+   holds a resource_guard that logs its cleanup), and the
+   application prints a summary and exits.
+
+   Contrast with task<>:
+     With task<>, every co_await that touches I/O needs:
+       auto [ec] = co_await delay(dur);
+       if(ec) co_return;            // <-- cancellation boilerplate
+     This is repeated at every suspension point.
+
+     With quitter<>, the promise intercepts the stop token
+     automatically.  The worker body is pure business logic.
+*/
+
+#include <boost/capy.hpp>
+
+#include <atomic>
+#include <chrono>
+#include <csignal>
+#include <iostream>
+#include <latch>
+#include <sstream>
+#include <stop_token>
+
+namespace capy = boost::capy;
+using namespace std::chrono_literals;
+
+// Global stop source wired to Ctrl+C.
+static std::stop_source g_stop;
+static std::atomic<std::chrono::steady_clock::time_point>
+    g_stop_time{std::chrono::steady_clock::time_point{}};
+
+extern "C" void signal_handler(int)
+{
+    g_stop_time.store(std::chrono::steady_clock::now(),
+        std::memory_order_relaxed);
+    g_stop.request_stop();
+}
+
+// RAII resource that logs construction and destruction.
+// Simulates holding a file handle, socket, or temp buffer
+// that must be released on shutdown.
+struct resource_guard
+{
+    int id;
+    std::atomic<int>& cleanup_count;
+
+    resource_guard(int id_, std::atomic<int>& count)
+        : id(id_)
+        , cleanup_count(count)
+    {
+        std::ostringstream oss;
+        oss << "  [worker " << id << "] acquired resources\n";
+        std::cout << oss.str();
+    }
+
+    ~resource_guard()
+    {
+        ++cleanup_count;
+        std::ostringstream oss;
+        oss << "  [worker " << id << "] released resources "
+            << "(cleanup)\n";
+        std::cout << oss.str();
+    }
+
+    resource_guard(resource_guard const&) = delete;
+    resource_guard& operator=(resource_guard const&) = delete;
+};
+
+// A single worker: download → transform → write, repeated.
+// No cancellation code.  quitter handles it.
+capy::quitter<> worker(
+    int id,
+    std::atomic<int>& items_processed,
+    std::atomic<int>& cleanup_count)
+{
+    resource_guard guard(id, cleanup_count);
+
+    for(int item = 0; ; ++item)
+    {
+        // Simulate download (200-400ms depending on worker)
+        auto download_time = 200ms + 50ms * id;
+        (void) co_await capy::delay(download_time);
+
+        // Simulate transform (CPU work — no co_await needed)
+        {
+            std::ostringstream oss;
+            oss << "  [worker " << id << "] processing item "
+                << item << "\n";
+            std::cout << oss.str();
+        }
+
+        // Simulate write (100ms)
+        (void) co_await capy::delay(100ms);
+
+        ++items_processed;
+    }
+
+    // Never reached — the loop is infinite.
+    // quitter exits at the next co_await after stop is requested.
+}
+
+int main()
+{
+    std::signal(SIGINT, signal_handler);
+#ifdef SIGTERM
+    std::signal(SIGTERM, signal_handler);
+#endif
+
+    constexpr int num_workers = 4;
+    capy::thread_pool pool(num_workers);
+    std::latch done(num_workers);
+
+    std::atomic<int> items_processed{0};
+    std::atomic<int> cleanup_count{0};
+
+    std::cout << "Starting " << num_workers
+              << " workers.  Press Ctrl+C to quit.\n\n";
+
+    for(int i = 0; i < num_workers; ++i)
+    {
+        capy::run_async(
+            pool.get_executor(),
+            g_stop.get_token(),
+            [&]() { done.count_down(); },
+            [&](std::exception_ptr) { done.count_down(); })(
+                worker(i, items_processed, cleanup_count));
+    }
+
+    done.wait();
+
+    auto stop_at = g_stop_time.load(std::memory_order_relaxed);
+    auto now = std::chrono::steady_clock::now();
+
+    std::cout << "\nShutdown complete.\n"
+              << "  Items processed: " << items_processed << "\n"
+              << "  Workers cleaned up: " << cleanup_count
+              << "/" << num_workers << "\n";
+
+    if(stop_at != std::chrono::steady_clock::time_point{})
+    {
+        auto us = std::chrono::duration_cast<
+            std::chrono::microseconds>(now - stop_at).count();
+        std::cout << "  Shutdown latency: " << us << " us\n";
+    }
+}
diff --git a/example/sender-bridge/CMakeLists.txt b/example/sender-bridge/CMakeLists.txt
new file mode 100644
index 000000000..3d7e77fd6
--- /dev/null
+++ b/example/sender-bridge/CMakeLists.txt
@@ -0,0 +1,36 @@
+#
+# Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+include(FetchContent)
+
+FetchContent_Declare(
+    execution
+    GIT_REPOSITORY https://github.com/bemanproject/execution.git
+    GIT_TAG main
+    SYSTEM
+)
+FetchContent_MakeAvailable(execution)
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
+    CMakeLists.txt
+    Jamfile)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_sender_bridge ${PFILES})
+
+set_property(TARGET capy_example_sender_bridge
+    PROPERTY FOLDER "examples")
+
+target_compile_features(capy_example_sender_bridge
+    PRIVATE cxx_std_23)
+
+target_link_libraries(capy_example_sender_bridge
+    Boost::capy
+    beman::execution_headers)
diff --git a/example/sender-bridge/sender_awaitable.hpp b/example/sender-bridge/sender_awaitable.hpp
new file mode 100644
index 000000000..11ae8f62c
--- /dev/null
+++ b/example/sender-bridge/sender_awaitable.hpp
@@ -0,0 +1,405 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EXAMPLE_SENDER_AWAITABLE_HPP
+#define BOOST_CAPY_EXAMPLE_SENDER_AWAITABLE_HPP
+
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/error.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <beman/execution/execution.hpp>
+
+#include <coroutine>
+#include <exception>
+#include <new>
+#include <stop_token>
+#include <system_error>
+#include <tuple>
+#include <type_traits>
+#include <variant>
+
+namespace boost::capy {
+
+namespace detail {
+
+struct stopped_t {};
+
+struct operation_cancelled {};
+
+struct bridge_env
+{
+    std::stop_token st_;
+
+    auto query(
+        beman::execution::get_stop_token_t const&)
+            const noexcept
+    {
+        return st_;
+    }
+};
+
+template<class Sender>
+using sender_single_value_t =
+    beman::execution::value_types_of_t<
+        Sender,
+        bridge_env,
+        std::tuple,
+        std::type_identity_t>;
+
+// Detect whether a sender can complete with
+// set_error(std::error_code).
+template<class Sender>
+struct has_error_code_completion
+{
+    template<class... Es>
+    struct checker
+    {
+        static constexpr bool value =
+            (std::is_same_v<
+                Es, std::error_code> || ...);
+    };
+
+    static constexpr bool value =
+        beman::execution::error_types_of_t<
+            Sender,
+            bridge_env,
+            checker>::value;
+};
+
+template<class Sender>
+constexpr bool has_error_code_v =
+    has_error_code_completion<Sender>::value;
+
+// Variant when sender can complete with
+// set_error(error_code): separate slot so
+// error_code is not wrapped in exception_ptr.
+template<class ValueTuple>
+using ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::error_code,
+    std::exception_ptr,
+    stopped_t>;
+
+// Variant when sender does not complete with
+// set_error(error_code).
+template<class ValueTuple>
+using no_ec_result_variant = std::variant<
+    std::monostate,
+    ValueTuple,
+    std::exception_ptr,
+    stopped_t>;
+
+template<class ValueTuple, bool HasEc>
+using result_variant = std::conditional_t<
+    HasEc,
+    ec_result_variant<ValueTuple>,
+    no_ec_result_variant<ValueTuple>>;
+
+// Bridge receiver that stores the sender's
+// completion result and posts the coroutine
+// handle back through the Capy executor.
+template<class ValueTuple, bool HasEc>
+struct bridge_receiver
+{
+    using receiver_concept =
+        beman::execution::receiver_t;
+
+    result_variant<ValueTuple, HasEc>* result_;
+    continuation                       cont_;
+    io_env const*                      env_;
+
+    auto get_env() const noexcept -> bridge_env
+    {
+        return {env_->stop_token};
+    }
+
+    template<class... Args>
+    void set_value(Args&&... args) && noexcept
+    {
+        result_->template emplace<1>(
+            std::forward<Args>(args)...);
+        env_->executor.post(cont_);
+    }
+
+    template<class E>
+    void set_error(E&& e) && noexcept
+    {
+        if constexpr (
+            HasEc &&
+            std::is_same_v<
+                std::decay_t<E>,
+                std::error_code>)
+            result_->template emplace<2>(
+                std::forward<E>(e));
+        else if constexpr (
+            std::is_same_v<
+                std::decay_t<E>,
+                std::exception_ptr>)
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::forward<E>(e));
+        }
+        else
+        {
+            constexpr auto idx = HasEc ? 3 : 2;
+            result_->template emplace<idx>(
+                std::make_exception_ptr(
+                    std::forward<E>(e)));
+        }
+        env_->executor.post(cont_);
+    }
+
+    void set_stopped() && noexcept
+    {
+        constexpr auto idx = HasEc ? 4 : 3;
+        result_->template emplace<idx>(
+            stopped_t{});
+        env_->executor.post(cont_);
+    }
+};
+
+} // namespace detail
+
+/** Awaitable that bridges a beman::execution
+    sender into a Capy coroutine.
+
+    Satisfies IoAwaitable. When co_awaited inside
+    a capy::task, connects the sender to a bridge
+    receiver, starts the operation, and resumes
+    the coroutine on the caller's executor when
+    the sender completes.
+
+    The bridge inspects the sender's error
+    completion signatures at compile time. If the
+    sender can complete with
+    set_error(std::error_code), await_resume
+    returns io_result so the error code is a
+    value, not an exception. Otherwise
+    await_resume returns the value directly and
+    genuine exceptions are rethrown.
+
+    @tparam Sender The beman::execution sender
+        type.
+*/
+template<class Sender>
+struct [[nodiscard]] sender_awaitable
+{
+    static constexpr bool has_ec =
+        detail::has_error_code_v<Sender>;
+
+    using value_tuple =
+        detail::sender_single_value_t<Sender>;
+    using variant_type =
+        detail::result_variant<
+            value_tuple, has_ec>;
+    using receiver_type =
+        detail::bridge_receiver<
+            value_tuple, has_ec>;
+    using op_state_type = decltype(
+        beman::execution::connect(
+            std::declval<Sender>(),
+            std::declval<receiver_type>()));
+
+    Sender sndr_;
+    variant_type result_{};
+
+    alignas(op_state_type)
+    unsigned char op_buf_[sizeof(op_state_type)];
+    bool op_constructed_ = false;
+
+    explicit sender_awaitable(Sender sndr)
+        : sndr_(std::move(sndr))
+    {
+    }
+
+    sender_awaitable(sender_awaitable&& o)
+        noexcept(
+            std::is_nothrow_move_constructible_v<
+                Sender>)
+        : sndr_(std::move(o.sndr_))
+    {
+    }
+
+    sender_awaitable(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable const&) = delete;
+    sender_awaitable& operator=(
+        sender_awaitable&&) = delete;
+
+    ~sender_awaitable()
+    {
+        if(op_constructed_)
+            std::launder(
+                reinterpret_cast<op_state_type*>(
+                    op_buf_))->~op_state_type();
+    }
+
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(
+        std::coroutine_handle<> h,
+        io_env const* env)
+    {
+        ::new(op_buf_) op_state_type(
+            beman::execution::connect(
+                std::move(sndr_),
+                receiver_type{
+                    &result_, {h}, env}));
+        op_constructed_ = true;
+        beman::execution::start(
+            *std::launder(
+                reinterpret_cast<
+                    op_state_type*>(
+                        op_buf_)));
+        return std::noop_coroutine();
+    }
+
+    auto await_resume()
+    {
+        if constexpr (has_ec)
+            return await_resume_ec();
+        else
+            return await_resume_no_ec();
+    }
+
+private:
+    // Sender can complete with
+    // set_error(error_code). Return io_result
+    // so the error code is a value, not an
+    // exception.
+    auto await_resume_ec()
+    {
+        // exception_ptr at index 3
+        if(result_.index() == 3)
+            std::rethrow_exception(
+                std::get<3>(result_));
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+        {
+            // stopped at index 4
+            if(result_.index() == 4)
+                return io_result<>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<>{
+                    std::get<2>(result_)};
+            return io_result<>{};
+        }
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+        {
+            using T = std::tuple_element_t<
+                0, value_tuple>;
+            if(result_.index() == 4)
+                return io_result<T>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<T>{
+                    std::get<2>(result_)};
+            return io_result<T>{
+                {},
+                std::get<0>(
+                    std::get<1>(
+                        std::move(result_)))};
+        }
+        else
+        {
+            if(result_.index() == 4)
+                return io_result<value_tuple>{
+                    make_error_code(
+                        error::canceled)};
+            if(result_.index() == 2)
+                return io_result<value_tuple>{
+                    std::get<2>(result_)};
+            return io_result<value_tuple>{
+                {},
+                std::get<1>(
+                    std::move(result_))};
+        }
+    }
+
+    // Sender does not complete with
+    // set_error(error_code). Return the value
+    // directly; rethrow exceptions.
+    auto await_resume_no_ec()
+    {
+        // exception_ptr at index 2
+        if(result_.index() == 2)
+            std::rethrow_exception(
+                std::get<2>(result_));
+        // stopped at index 3
+        if(result_.index() == 3)
+            throw detail::operation_cancelled{};
+
+        if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 0)
+            return;
+        else if constexpr (
+            std::tuple_size_v<
+                value_tuple> == 1)
+            return std::get<0>(
+                std::get<1>(
+                    std::move(result_)));
+        else
+            return std::get<1>(
+                std::move(result_));
+    }
+};
+
+/** Create an IoAwaitable from a
+    beman::execution sender.
+
+    If the sender can complete with
+    set_error(std::error_code), the returned
+    awaitable yields io_result so the error code
+    is a value, not an exception. Otherwise the
+    awaitable yields the value directly.
+
+    @par Example
+    @code
+    capy::task<int> compute(auto sched)
+    {
+        auto result = co_await await_sender(
+            beman::execution::schedule(sched)
+                | beman::execution::then(
+                    [] { return 42; }));
+        co_return result;
+    }
+    @endcode
+
+    @param sndr The sender to bridge.
+    @return An IoAwaitable that can be co_awaited
+        in a capy::task.
+*/
+template<class Sender>
+auto await_sender(Sender&& sndr)
+{
+    return sender_awaitable<
+        std::decay_t<Sender>>(
+            std::forward<Sender>(sndr));
+}
+
+} // namespace boost::capy
+
+#endif
diff --git a/example/sender-bridge/sender_bridge.cpp b/example/sender-bridge/sender_bridge.cpp
new file mode 100644
index 000000000..cf65267fe
--- /dev/null
+++ b/example/sender-bridge/sender_bridge.cpp
@@ -0,0 +1,82 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include "sender_awaitable.hpp"
+
+#include <boost/capy.hpp>
+
+#include <beman/execution/execution.hpp>
+
+#include <iostream>
+#include <latch>
+#include <thread>
+
+namespace capy = boost::capy;
+namespace ex = beman::execution;
+
+capy::task<int> compute(auto sched)
+{
+    int result = co_await capy::await_sender(
+        ex::schedule(sched)
+            | ex::then([] {
+                std::cout
+                    << "  sender running on thread "
+                    << std::this_thread::get_id() << "\n";
+                return 42 * 42;
+            }));
+
+    std::cout
+        << "  coroutine resumed on thread "
+        << std::this_thread::get_id() << "\n";
+
+    co_return result;
+}
+
+int main()
+{
+    std::cout
+        << "main thread: "
+        << std::this_thread::get_id() << "\n";
+
+    // Capy execution context
+    capy::thread_pool pool;
+
+    // Beman execution context (run_loop on a dedicated thread)
+    ex::run_loop loop;
+    std::jthread loop_thread([&loop] {
+        loop.run();
+    });
+    auto sched = loop.get_scheduler();
+
+    std::latch done(1);
+    int answer = 0;
+
+    auto on_complete = [&](int v) {
+        answer = v;
+        done.count_down();
+    };
+
+    auto on_error = [&](std::exception_ptr ep) {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "error: " << e.what() << "\n";
+        }
+        done.count_down();
+    };
+
+    capy::run_async(
+        pool.get_executor(),
+        on_complete,
+        on_error)(compute(sched));
+
+    done.wait();
+    loop.finish();
+
+    std::cout << "result: " << answer << "\n";
+}
diff --git a/example/strand-serialization/CMakeLists.txt b/example/strand-serialization/CMakeLists.txt
new file mode 100644
index 000000000..bda2a696a
--- /dev/null
+++ b/example/strand-serialization/CMakeLists.txt
@@ -0,0 +1,22 @@
+#
+# Copyright (c) 2026 Mungo Gill
+#
+# Distributed under the Boost Software License, Version 1.0. (See accompanying
+# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+#
+# Official repository: https://github.com/cppalliance/capy
+#
+
+file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp
+    CMakeLists.txt
+    Jamfile)
+
+source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
+
+add_executable(capy_example_strand_serialization ${PFILES})
+
+set_property(TARGET capy_example_strand_serialization
+    PROPERTY FOLDER "examples")
+
+target_link_libraries(capy_example_strand_serialization
+    Boost::capy)
diff --git a/example/strand-serialization/strand_serialization.cpp b/example/strand-serialization/strand_serialization.cpp
new file mode 100644
index 000000000..930a32b8d
--- /dev/null
+++ b/example/strand-serialization/strand_serialization.cpp
@@ -0,0 +1,73 @@
+//
+// Copyright (c) 2026 Mungo Gill
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+//
+// Strand Serialization Example
+//
+// Demonstrates protecting shared state with a strand instead of a mutex.
+// Multiple coroutines increment a shared counter concurrently on a
+// multi-threaded thread_pool; the strand guarantees serialized access.
+//
+
+#include <boost/capy.hpp>
+#include <iostream>
+#include <latch>
+#include <vector>
+
+namespace capy = boost::capy;
+
+int main()
+{
+    constexpr int num_coroutines = 10;
+    constexpr int increments_per_coro = 1000;
+
+    capy::thread_pool pool(4);
+    capy::strand s{pool.get_executor()};
+    std::latch done(1);
+
+    auto on_complete = [&done](auto&&...) { done.count_down(); };
+    auto on_error = [&done](std::exception_ptr ep) {
+        try { std::rethrow_exception(ep); }
+        catch (std::exception const& e) {
+            std::cerr << "Error: " << e.what() << "\n";
+        }
+        catch (...) {
+            std::cerr << "Error: unknown exception\n";
+        }
+        done.count_down();
+    };
+
+    int counter = 0;
+
+    // Each coroutine increments the shared counter without locks.
+    // The strand ensures only one coroutine runs at a time.
+    auto increment = [&](int id) -> capy::io_task<> {
+        for (int i = 0; i < increments_per_coro; ++i)
+            ++counter;
+        std::cout << "Coroutine " << id
+                  << " finished, counter = " << counter << "\n";
+        co_return capy::io_result<>{};
+    };
+
+    auto run_all = [&]() -> capy::task<> {
+        std::vector<capy::io_task<>> tasks;
+        for (int i = 0; i < num_coroutines; ++i)
+            tasks.push_back(increment(i));
+        (void) co_await capy::when_all(std::move(tasks));
+    };
+
+    capy::run_async(s, on_complete, on_error)(run_all());
+    done.wait();
+
+    int expected = num_coroutines * increments_per_coro;
+    std::cout << "\nFinal counter: " << counter
+              << " (expected " << expected << ")\n";
+
+    return 0;
+}
diff --git a/example/stream-pipeline/stream_pipeline.cpp b/example/stream-pipeline/stream_pipeline.cpp
index 3b8169235..a81d1b1a3 100644
--- a/example/stream-pipeline/stream_pipeline.cpp
+++ b/example/stream-pipeline/stream_pipeline.cpp
@@ -33,7 +33,7 @@
 #include <cctype>
 #include <system_error>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 //------------------------------------------------------------------------------
 //
@@ -46,13 +46,13 @@ using namespace boost::capy;
 
 class uppercase_transform
 {
-    any_buffer_source* source_;  // any_buffer_source*
+    capy::any_buffer_source* source_;  // any_buffer_source*
     std::vector<char> buffer_;   // std::vector<char> - transformed data
     std::size_t consumed_ = 0;   // std::size_t - bytes consumed by downstream
     bool exhausted_ = false;     // bool - upstream exhausted
-    
+
 public:
-    explicit uppercase_transform(any_buffer_source& source)
+    explicit uppercase_transform(capy::any_buffer_source& source)
         : source_(&source)
     {
     }
@@ -71,63 +71,63 @@ class uppercase_transform
     }
     
     // BufferSource::pull - returns task<> to enable co_await on upstream
-    io_task<std::span<const_buffer>>
-    pull(std::span<const_buffer> dest)
+    capy::io_task<std::span<capy::const_buffer>>
+    pull(std::span<capy::const_buffer> dest)
     {
         // Already have unconsumed data?
         if (consumed_ < buffer_.size())
         {
             if (dest.empty())
-                co_return {std::error_code{}, std::span<const_buffer>{}};
-            
-            dest[0] = const_buffer(
+                co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+            dest[0] = capy::const_buffer(
                 buffer_.data() + consumed_,
                 buffer_.size() - consumed_);
             co_return {std::error_code{}, dest.first(1)};
         }
-        
+
         // Upstream exhausted?
         if (exhausted_)
-            co_return {error::eof, std::span<const_buffer>{}};
-        
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
+
         // Pull from upstream
         buffer_.clear();
         consumed_ = 0;
-        
-        const_buffer upstream[8];  // const_buffer[8]
+
+        capy::const_buffer upstream[8];  // const_buffer[8]
         // ec: std::error_code, bufs: std::span<const_buffer>
         auto [ec, bufs] = co_await source_->pull(upstream);
-        
-        if (ec == cond::eof)
+
+        if (ec == capy::cond::eof)
         {
             exhausted_ = true;
-            co_return {error::eof, std::span<const_buffer>{}};
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
         }
 
         if (ec)
-            co_return {ec, std::span<const_buffer>{}};
-        
+            co_return {ec, std::span<capy::const_buffer>{}};
+
         // Transform: uppercase each byte
         for (auto const& buf : bufs)  // const_buffer const&
         {
             auto const* data = static_cast<char const*>(buf.data());  // char const*
             auto size = buf.size();  // std::size_t
-            
+
             for (std::size_t i = 0; i < size; ++i)
             {
                 buffer_.push_back(static_cast<char>(
                     std::toupper(static_cast<unsigned char>(data[i]))));
             }
         }
-        
+
         // Consume from upstream
-        source_->consume(buffer_size(bufs));
-        
+        source_->consume(capy::buffer_size(bufs));
+
         // Return transformed data
         if (dest.empty() || buffer_.empty())
-            co_return {std::error_code{}, std::span<const_buffer>{}};
-        
-        dest[0] = const_buffer(buffer_.data(), buffer_.size());
+            co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+        dest[0] = capy::const_buffer(buffer_.data(), buffer_.size());
         co_return {std::error_code{}, dest.first(1)};
     }
 };
@@ -143,15 +143,15 @@ class uppercase_transform
 
 class line_numbering_transform
 {
-    any_buffer_source* source_;  // any_buffer_source*
+    capy::any_buffer_source* source_;  // any_buffer_source*
     std::string buffer_;         // std::string - transformed data
     std::size_t consumed_ = 0;   // std::size_t - bytes consumed by downstream
     std::size_t line_num_ = 1;   // std::size_t - current line number
     bool at_line_start_ = true;  // bool - are we at start of a line?
     bool exhausted_ = false;     // bool - upstream exhausted
-    
+
 public:
-    explicit line_numbering_transform(any_buffer_source& source)
+    explicit line_numbering_transform(capy::any_buffer_source& source)
         : source_(&source)
     {
     }
@@ -170,48 +170,48 @@ class line_numbering_transform
     }
     
     // BufferSource::pull - returns task<> to enable co_await on upstream
-    io_task<std::span<const_buffer>>
-    pull(std::span<const_buffer> dest)
+    capy::io_task<std::span<capy::const_buffer>>
+    pull(std::span<capy::const_buffer> dest)
     {
         // Already have unconsumed data?
         if (consumed_ < buffer_.size())
         {
             if (dest.empty())
-                co_return {std::error_code{}, std::span<const_buffer>{}};
-            
-            dest[0] = const_buffer(
+                co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+            dest[0] = capy::const_buffer(
                 buffer_.data() + consumed_,
                 buffer_.size() - consumed_);
             co_return {std::error_code{}, dest.first(1)};
         }
-        
+
         // Upstream exhausted?
         if (exhausted_)
-            co_return {error::eof, std::span<const_buffer>{}};
-        
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
+
         // Pull from upstream
         buffer_.clear();
         consumed_ = 0;
-        
-        const_buffer upstream[8];  // const_buffer[8]
+
+        capy::const_buffer upstream[8];  // const_buffer[8]
         // ec: std::error_code, bufs: std::span<const_buffer>
         auto [ec, bufs] = co_await source_->pull(upstream);
-        
-        if (ec == cond::eof)
+
+        if (ec == capy::cond::eof)
         {
             exhausted_ = true;
-            co_return {error::eof, std::span<const_buffer>{}};
+            co_return {capy::error::eof, std::span<capy::const_buffer>{}};
         }
 
         if (ec)
-            co_return {ec, std::span<const_buffer>{}};
-        
+            co_return {ec, std::span<capy::const_buffer>{}};
+
         // Transform: add line numbers
         for (auto const& buf : bufs)  // const_buffer const&
         {
             auto const* data = static_cast<char const*>(buf.data());  // char const*
             auto size = buf.size();  // std::size_t
-            
+
             for (std::size_t i = 0; i < size; ++i)
             {
                 if (at_line_start_)
@@ -224,15 +224,15 @@ class line_numbering_transform
                     at_line_start_ = true;
             }
         }
-        
+
         // Consume from upstream
-        source_->consume(buffer_size(bufs));
-        
+        source_->consume(capy::buffer_size(bufs));
+
         // Return transformed data
         if (dest.empty() || buffer_.empty())
-            co_return {std::error_code{}, std::span<const_buffer>{}};
-        
-        dest[0] = const_buffer(buffer_.data(), buffer_.size());
+            co_return {std::error_code{}, std::span<capy::const_buffer>{}};
+
+        dest[0] = capy::const_buffer(buffer_.data(), buffer_.size());
         co_return {std::error_code{}, dest.first(1)};
     }
 };
@@ -243,22 +243,22 @@ class line_numbering_transform
 //
 //------------------------------------------------------------------------------
 
-task<std::size_t> transfer(any_buffer_source& source, any_write_sink& sink)
+capy::task<std::size_t> transfer(capy::any_buffer_source& source, capy::any_write_sink& sink)
 {
     std::size_t total = 0;  // std::size_t
-    const_buffer bufs[8];   // const_buffer[8]
-    
+    capy::const_buffer bufs[8];   // const_buffer[8]
+
     for (;;)
     {
         // ec: std::error_code, spans: std::span<const_buffer>
         auto [ec, spans] = co_await source.pull(bufs);
-        
-        if (ec == cond::eof)
+
+        if (ec == capy::cond::eof)
             break;
 
         if (ec)
             throw std::system_error(ec);
-        
+
         // Write each buffer to sink
         for (auto const& buf : spans)  // const_buffer const&
         {
@@ -268,15 +268,15 @@ task<std::size_t> transfer(any_buffer_source& source, any_write_sink& sink)
                 throw std::system_error(wec);
             total += n;
         }
-        
+
         // Consume what we read
-        source.consume(buffer_size(spans));
+        source.consume(capy::buffer_size(spans));
     }
-    
-    io_result<> eof_result = co_await sink.write_eof();
+
+    capy::io_result<> eof_result = co_await sink.write_eof();
     if (eof_result.ec)
         throw std::system_error(eof_result.ec);
-    
+
     co_return total;
 }
 
@@ -295,8 +295,8 @@ void demo_pipeline()
     std::cout << "Input:\n" << input << "\n";
     
     // Create mock source with input data
-    test::fuse f;  // test::fuse
-    test::buffer_source source(f);  // test::buffer_source
+    capy::test::fuse f;  // test::fuse
+    capy::test::buffer_source source(f);  // test::buffer_source
     source.provide(input);
     
     // Build the pipeline using type-erased buffer sources:
@@ -305,26 +305,26 @@ void demo_pipeline()
     // Stage 1: Wrap raw source as any_buffer_source.
     // Using pointer construction (&source) for reference semantics - the
     // wrapper does not take ownership, so source must outlive src.
-    any_buffer_source src{&source};  // any_buffer_source
+    capy::any_buffer_source src{&source};  // any_buffer_source
     
     // Stage 2: Uppercase transform wraps src.
     // Again using pointer construction so upper_src references upper
     // without taking ownership.
     uppercase_transform upper{src};  // uppercase_transform
-    any_buffer_source upper_src{&upper};  // any_buffer_source
+    capy::any_buffer_source upper_src{&upper};  // any_buffer_source
     
     // Stage 3: Line numbering transform wraps upper_src.
     line_numbering_transform numbered{upper_src};  // line_numbering_transform
-    any_buffer_source numbered_src{&numbered};  // any_buffer_source
+    capy::any_buffer_source numbered_src{&numbered};  // any_buffer_source
     
     // Create sink to collect output.
     // Pointer construction ensures sink outlives dst.
-    test::write_sink sink(f);  // test::write_sink
-    any_write_sink dst{&sink};  // any_write_sink
+    capy::test::write_sink sink(f);  // test::write_sink
+    capy::any_write_sink dst{&sink};  // any_write_sink
     
     // Run the pipeline
     std::size_t bytes = 0;  // std::size_t
-    test::run_blocking([&](std::size_t n) { bytes = n; })(
+    capy::test::run_blocking([&](std::size_t n) { bytes = n; })(
         transfer(numbered_src, dst));
     
     std::cout << "Output (" << bytes << " bytes):\n";
diff --git a/example/timeout-cancellation/timeout_cancellation.cpp b/example/timeout-cancellation/timeout_cancellation.cpp
index 22298c12e..b1c791bba 100644
--- a/example/timeout-cancellation/timeout_cancellation.cpp
+++ b/example/timeout-cancellation/timeout_cancellation.cpp
@@ -16,12 +16,12 @@
 #include <latch>
 #include <thread>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // A slow operation that respects cancellation
-task<std::string> slow_fetch(int steps)
+capy::task<std::string> slow_fetch(int steps)
 {
-    auto token = co_await this_coro::stop_token;  // std::stop_token
+    auto token = co_await capy::this_coro::stop_token;  // std::stop_token
     std::string result;
     
     for (int i = 0; i < steps; ++i)
@@ -49,9 +49,9 @@ task<std::string> slow_fetch(int steps)
 }
 
 // Run with timeout (conceptual - real implementation needs timer)
-task<std::optional<std::string>> fetch_with_timeout()
+capy::task<std::optional<std::string>> fetch_with_timeout()
 {
-    auto token = co_await this_coro::stop_token;  // std::stop_token
+    auto token = co_await capy::this_coro::stop_token;  // std::stop_token
     
     try
     {
@@ -70,11 +70,11 @@ void demo_normal_completion()
 {
     std::cout << "Demo: Normal completion\n";
     
-    thread_pool pool;
+    capy::thread_pool pool;
     std::stop_source source;
     std::latch done(1);  // std::latch - wait for 1 task
-    
-    run_async(pool.get_executor(), source.get_token(),
+
+    capy::run_async(pool.get_executor(), source.get_token(),
         [&done](std::optional<std::string> result) {
             if (result)
                 std::cout << "Result: " << *result << "\n";
@@ -84,20 +84,20 @@ void demo_normal_completion()
         },
         [&done](std::exception_ptr) { done.count_down(); }
     )(fetch_with_timeout());
-    
+
     done.wait();  // Block until task completes
 }
 
 void demo_cancellation()
 {
     std::cout << "\nDemo: Cancellation after 2 steps\n";
-    
-    thread_pool pool;
+
+    capy::thread_pool pool;
     std::stop_source source;
     std::latch done(1);  // std::latch - wait for 1 task
-    
+
     // Launch the task
-    run_async(pool.get_executor(), source.get_token(),
+    capy::run_async(pool.get_executor(), source.get_token(),
         [&done](std::optional<std::string> result) {
             if (result)
                 std::cout << "Result: " << *result << "\n";
@@ -120,9 +120,9 @@ void demo_cancellation()
 }
 
 // Example: Manual stop token checking
-task<int> process_items(std::vector<int> const& items)
+capy::task<int> process_items(std::vector<int> const& items)
 {
-    auto token = co_await this_coro::stop_token;  // std::stop_token
+    auto token = co_await capy::this_coro::stop_token;  // std::stop_token
     int sum = 0;
     
     for (auto item : items)  // int
diff --git a/example/type-erased-echo/echo.cpp b/example/type-erased-echo/echo.cpp
index fe04912b4..c11b7310f 100644
--- a/example/type-erased-echo/echo.cpp
+++ b/example/type-erased-echo/echo.cpp
@@ -15,30 +15,25 @@
 
 namespace myapp {
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
-task<> echo_session(any_stream& stream)
+capy::task<> echo_session(capy::any_stream& stream)
 {
     char buffer[1024];
-    
+
     for (;;)
     {
-        // Read some data
         // ec: std::error_code, n: std::size_t
-        auto [ec, n] = co_await stream.read_some(make_buffer(buffer));
-        
-        if (ec == cond::eof)
-            co_return;  // Client closed connection
-        
-        if (ec)
-            throw std::system_error(ec);
-        
-        // Echo it back
+        auto [ec, n] = co_await stream.read_some(capy::make_buffer(buffer));
+
         // wec: std::error_code, wn: std::size_t
-        auto [wec, wn] = co_await write(stream, const_buffer(buffer, n));
-        
+        auto [wec, wn] = co_await capy::write(stream, capy::const_buffer(buffer, n));
+
+        if (ec)
+            co_return;
+
         if (wec)
-            throw std::system_error(wec);
+            co_return;
     }
 }
 
diff --git a/example/type-erased-echo/main.cpp b/example/type-erased-echo/main.cpp
index 9d096becc..9b9cb8ce4 100644
--- a/example/type-erased-echo/main.cpp
+++ b/example/type-erased-echo/main.cpp
@@ -14,19 +14,19 @@
 #include <boost/capy/test/run_blocking.hpp>
 #include <iostream>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 void test_with_mock()
 {
-    auto [a, b] = test::make_stream_pair();
+    auto [a, b] = capy::test::make_stream_pair();
     b.provide("Hello, ");
     b.provide("World!\n");
     b.close();
     
     // Using pointer construction (&a) for reference semantics - the
     // wrapper does not take ownership, so a must outlive stream.
-    any_stream stream{&a};  // any_stream
-    test::run_blocking()(myapp::echo_session(stream));
+    capy::any_stream stream{&a};  // any_stream
+    capy::test::run_blocking()(myapp::echo_session(stream));
     
     std::cout << "Echo output: " << b.data() << "\n";
 }
diff --git a/example/when-any-cancellation/when_any_cancellation.cpp b/example/when-any-cancellation/when_any_cancellation.cpp
index f11ede75e..020e45f92 100644
--- a/example/when-any-cancellation/when_any_cancellation.cpp
+++ b/example/when-any-cancellation/when_any_cancellation.cpp
@@ -31,15 +31,16 @@
 #include <latch>
 #include <string>
 #include <thread>
+#include <vector>
 
-using namespace boost::capy;
+namespace capy = boost::capy;
 
 // Simulates a data source that takes `steps` iterations to produce a result.
 // Each step checks the stop token so the task exits promptly when cancelled.
-task<std::string> fetch_from_source(
+capy::io_task<std::string> fetch_from_source(
     std::string name, int steps, int step_ms)
 {
-    auto token = co_await this_coro::stop_token;
+    auto token = co_await capy::this_coro::stop_token;
 
     for (int i = 0; i < steps; ++i)
     {
@@ -47,7 +48,7 @@ task<std::string> fetch_from_source(
         {
             std::cout << "  [" << name << "] cancelled at step "
                       << i << "/" << steps << "\n";
-            co_return name + ": cancelled";
+            co_return capy::io_result<std::string>{{}, name + ": cancelled"};
         }
 
         // Simulate work
@@ -58,32 +59,37 @@ task<std::string> fetch_from_source(
                   << (i + 1) << "/" << steps << "\n";
     }
 
-    co_return name + ": done";
+    co_return capy::io_result<std::string>{{}, name + ": done"};
 }
 
 // Race three sources — the fastest one wins, the rest get cancelled.
-task<> race_data_sources()
+capy::task<> race_data_sources()
 {
     std::cout << "=== Racing three data sources ===\n\n";
 
     // source_a: 2 steps * 20ms = fast
     // source_b: 5 steps * 20ms = medium
     // source_c: 8 steps * 20ms = slow
-    auto [winner_index, result] = co_await when_any(
+    auto result = co_await capy::when_any(
         fetch_from_source("source_a", 2, 20),
         fetch_from_source("source_b", 5, 20),
         fetch_from_source("source_c", 8, 20));
 
-    auto value = std::get<std::string>(result);
-    std::cout << "\nWinner: index=" << winner_index
-              << " value=\"" << value << "\"\n";
+    if (result.index() != 0)
+    {
+        std::visit([](auto const& v) {
+            if constexpr (!std::is_same_v<
+                std::decay_t<decltype(v)>, std::error_code>)
+                std::cout << "\nWinner: \"" << v << "\"\n";
+        }, result);
+    }
 }
 
 // A void task that loops until stopped.
 // Useful for background workers that run indefinitely.
-task<> background_worker(std::string name, int step_ms)
+capy::io_task<> background_worker(std::string name, int step_ms)
 {
-    auto token = co_await this_coro::stop_token;
+    auto token = co_await capy::this_coro::stop_token;
     int iteration = 0;
 
     while (!token.stop_requested())
@@ -96,64 +102,51 @@ task<> background_worker(std::string name, int step_ms)
 
     std::cout << "  [" << name << "] stopped after "
               << iteration << " iterations\n";
+    co_return capy::io_result<>{};
 }
 
 // A task that finishes after a fixed delay (acts as a timeout).
-task<> timeout(int ms)
+capy::io_task<> timeout(int ms)
 {
     std::this_thread::sleep_for(
         std::chrono::milliseconds(ms));
     std::cout << "  [timeout] expired after " << ms << "ms\n";
-    co_return;
-}
-
-// Use when_any with a timeout to bound the lifetime of a background worker.
-// With void tasks, the variadic overload returns pair<size_t, variant<monostate>>.
-// We only need the winner index to know which task completed first.
-task<> timeout_a_worker()
-{
-    std::cout << "\n=== Timeout a background worker ===\n\n";
-
-    auto [winner, _] = co_await when_any(
-        background_worker("worker", 30),
-        timeout(100));
-
-    if (winner == 1)
-        std::cout << "\nTimeout fired — worker was cancelled\n";
-    else
-        std::cout << "\nWorker finished before timeout\n";
+    co_return capy::io_result<>{};
 }
 
-// Race a vector of tasks (homogeneous range overload).
-task<> race_vector_of_sources()
+// Race three replicas using variadic overload.
+capy::task<> race_vector_of_sources()
 {
-    std::cout << "\n=== Racing a vector of sources ===\n\n";
-
-    std::vector<task<std::string>> tasks;
-    tasks.push_back(fetch_from_source("replica_1", 6, 20));
-    tasks.push_back(fetch_from_source("replica_2", 3, 20));
-    tasks.push_back(fetch_from_source("replica_3", 5, 20));
+    std::cout << "\n=== Racing three replicas ===\n\n";
 
-    auto [winner_index, value] = co_await when_any(std::move(tasks));
+    auto result = co_await capy::when_any(
+        fetch_from_source("replica_1", 6, 20),
+        fetch_from_source("replica_2", 3, 20),
+        fetch_from_source("replica_3", 5, 20));
 
-    std::cout << "\nFastest replica: index=" << winner_index
-              << " value=\"" << value << "\"\n";
+    if (result.index() != 0)
+    {
+        std::visit([](auto const& v) {
+            if constexpr (!std::is_same_v<
+                std::decay_t<decltype(v)>, std::error_code>)
+                std::cout << "\nFastest replica: \"" << v << "\"\n";
+        }, result);
+    }
 }
 
 // Run all demos sequentially so output is readable.
-task<> run_demos()
+capy::task<> run_demos()
 {
     co_await race_data_sources();
-    co_await timeout_a_worker();
     co_await race_vector_of_sources();
 }
 
 int main()
 {
-    thread_pool pool;
+    capy::thread_pool pool;
     std::latch done(1);
 
-    run_async(pool.get_executor(),
+    capy::run_async(pool.get_executor(),
         [&done](auto&&...) { done.count_down(); },
         [&done](std::exception_ptr ep) {
             try { std::rethrow_exception(ep); }
diff --git a/extra/test_suite/CMakeLists.txt b/extra/test_suite/CMakeLists.txt
index 0f3e25b5e..748290f6a 100644
--- a/extra/test_suite/CMakeLists.txt
+++ b/extra/test_suite/CMakeLists.txt
@@ -15,13 +15,25 @@ cmake_minimum_required(VERSION 3.8...3.20)
 
 # Library target with test_suite.cpp and test_suite.hpp
 add_library(boost_capy_test_suite STATIC test_suite.cpp test_suite.hpp)
-target_include_directories(boost_capy_test_suite PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(boost_capy_test_suite PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/boost/capy/extra/test_suite>)
 target_compile_features(boost_capy_test_suite PUBLIC cxx_std_17)
 
 # Library target that links the previous and adds test_main.cpp
 add_library(boost_capy_test_suite_main STATIC test_main.cpp)
 target_link_libraries(boost_capy_test_suite_main PUBLIC boost_capy_test_suite)
-target_include_directories(boost_capy_test_suite_main PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
+target_include_directories(boost_capy_test_suite_main PUBLIC
+    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
+    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/boost/capy/extra/test_suite>)
 target_compile_features(boost_capy_test_suite_main PUBLIC cxx_std_17)
 
+set_target_properties(boost_capy_test_suite PROPERTIES
+    EXPORT_NAME capy_test_suite)
+set_target_properties(boost_capy_test_suite_main PROPERTIES
+    EXPORT_NAME capy_test_suite_main)
+
+add_library(Boost::capy_test_suite ALIAS boost_capy_test_suite)
+add_library(Boost::capy_test_suite_main ALIAS boost_capy_test_suite_main)
+
 include(DiscoverTests.cmake)
diff --git a/include/boost/capy.hpp b/include/boost/capy.hpp
index 5d9c99280..8af85a453 100644
--- a/include/boost/capy.hpp
+++ b/include/boost/capy.hpp
@@ -22,11 +22,14 @@
 #include <boost/capy/error.hpp>
 #include <boost/capy/io_result.hpp>
 #include <boost/capy/io_task.hpp>
+#include <boost/capy/quitter.hpp>
 #include <boost/capy/task.hpp>
 
 // Algorithms
+#include <boost/capy/delay.hpp>
 #include <boost/capy/read.hpp>
 #include <boost/capy/read_until.hpp>
+#include <boost/capy/timeout.hpp>
 #include <boost/capy/when_all.hpp>
 #include <boost/capy/when_any.hpp>
 #include <boost/capy/write.hpp>
@@ -34,15 +37,12 @@
 // Buffers
 #include <boost/capy/buffers.hpp>
 #include <boost/capy/buffers/buffer_copy.hpp>
-#include <boost/capy/buffers/buffer_pair.hpp>
 #include <boost/capy/buffers/buffer_param.hpp>
+#include <boost/capy/buffers/buffer_slice.hpp>
 #include <boost/capy/buffers/circular_dynamic_buffer.hpp>
-#include <boost/capy/buffers/consuming_buffers.hpp>
 #include <boost/capy/buffers/flat_dynamic_buffer.hpp>
 #include <boost/capy/buffers/front.hpp>
 #include <boost/capy/buffers/make_buffer.hpp>
-#include <boost/capy/buffers/slice.hpp>
-#include <boost/capy/buffers/buffer_array.hpp>
 #include <boost/capy/buffers/string_dynamic_buffer.hpp>
 #include <boost/capy/buffers/vector_dynamic_buffer.hpp>
 
@@ -61,6 +61,7 @@
 #include <boost/capy/concept/mutable_buffer_sequence.hpp>
 #include <boost/capy/concept/read_source.hpp>
 #include <boost/capy/concept/read_stream.hpp>
+#include <boost/capy/concept/slice.hpp>
 #include <boost/capy/concept/stream.hpp>
 #include <boost/capy/concept/write_sink.hpp>
 #include <boost/capy/concept/write_stream.hpp>
diff --git a/include/boost/capy/buffers.hpp b/include/boost/capy/buffers.hpp
index 36e1cc75b..9fe509754 100644
--- a/include/boost/capy/buffers.hpp
+++ b/include/boost/capy/buffers.hpp
@@ -32,30 +32,6 @@ namespace capy {
 class const_buffer;
 class mutable_buffer;
 
-//------------------------------------------------
-
-/// Tag type for customizing `buffer_size` via `tag_invoke`.
-struct size_tag {};
-
-/// Tag type for customizing slice operations via `tag_invoke`.
-struct slice_tag {};
-
-/** Constants for slice customization.
-
-    Passed to `tag_invoke` overloads to specify which portion
-    of a buffer sequence to retain.
-*/
-enum class slice_how
-{
-    /// Remove bytes from the front of the sequence.
-    remove_prefix,
-
-    /// Keep only the first N bytes.
-    keep_prefix
-};
-
-//------------------------------------------------
-
 /** A reference to a contiguous region of writable memory.
 
     Represents a pointer and size pair for a modifiable byte range.
@@ -74,11 +50,11 @@ class mutable_buffer
     /// Construct an empty buffer.
     mutable_buffer() = default;
 
-    /// Copy constructor.
+    /// Construct a copy.
     mutable_buffer(
         mutable_buffer const&) = default;
 
-    /// Copy assignment.
+    /// Assign by copying.
     mutable_buffer& operator=(
         mutable_buffer const&) = default;
 
@@ -115,39 +91,8 @@ class mutable_buffer
         n_ -= n;
         return *this;
     }
-
-    /// Slice customization point for `tag_invoke`.
-    friend
-    void
-    tag_invoke(
-        slice_tag const&,
-        mutable_buffer& b,
-        slice_how how,
-        std::size_t n) noexcept
-    {
-        b.do_slice(how, n);
-    }
-
-private:
-    void do_slice(
-        slice_how how, std::size_t n) noexcept
-    {
-        switch(how)
-        {
-        case slice_how::remove_prefix:
-            *this += n;
-            return;
-
-        case slice_how::keep_prefix:
-            if( n < n_)
-                n_ = n;
-            return;
-        }
-    }
 };
 
-//------------------------------------------------
-
 /** A reference to a contiguous region of read-only memory.
 
     Represents a pointer and size pair for a non-modifiable byte range.
@@ -166,10 +111,10 @@ class const_buffer
     /// Construct an empty buffer.
     const_buffer() = default;
 
-    /// Copy constructor.
+    /// Construct a copy.
     const_buffer(const_buffer const&) = default;
 
-    /// Copy assignment.
+    /// Assign by copying.
     const_buffer& operator=(
         const_buffer const& other) = default;
 
@@ -214,39 +159,8 @@ class const_buffer
         n_ -= n;
         return *this;
     }
-
-    /// Slice customization point for `tag_invoke`.
-    friend
-    void
-    tag_invoke(
-        slice_tag const&,
-        const_buffer& b,
-        slice_how how,
-        std::size_t n) noexcept
-    {
-        b.do_slice(how, n);
-    }
-
-private:
-    void do_slice(
-        slice_how how, std::size_t n) noexcept
-    {
-        switch(how)
-        {
-        case slice_how::remove_prefix:
-            *this += n;
-            return;
-
-        case slice_how::keep_prefix:
-            if( n < n_)
-                n_ = n;
-            return;
-        }
-    }
 };
 
-//------------------------------------------------
-
 /** Concept for sequences of read-only buffer regions.
 
     A type satisfies `ConstBufferSequence` if it represents one or more
@@ -284,8 +198,6 @@ concept MutableBufferSequence =
         std::ranges::bidirectional_range<T> &&
         std::is_convertible_v<std::ranges::range_value_t<T>, mutable_buffer>);
 
-//------------------------------------------------------------------------------
-
 /** Return an iterator to the first buffer in a sequence.
 
     Handles single buffers and ranges uniformly. For a single buffer,
@@ -342,23 +254,6 @@ constexpr struct end_mrdocs_workaround_t
     }
 } end {};
 
-//------------------------------------------------------------------------------
-
-template<ConstBufferSequence CB>
-std::size_t
-tag_invoke(
-    size_tag const&,
-    CB const& bs) noexcept
-{
-    std::size_t n = 0;
-    auto const e = end(bs);
-    for(auto it = begin(bs); it != e; ++it)
-        n += const_buffer(*it).size();
-    return n;
-}
-
-//------------------------------------------------------------------------------
-
 /** Return the total byte count across all buffers in a sequence.
 
     Sums the `size()` of each buffer in the sequence. This differs
@@ -372,12 +267,29 @@ tag_invoke(
 */
 constexpr struct buffer_size_mrdocs_workaround_t
 {
+    // GCC 13 falsely flags reads of arr_[i].n_ in detail::buffer_array
+    // when iterating here. The class uses union storage with placement
+    // new for slots 0..n_-1, so reads inside this bounded loop are
+    // well-defined, but the optimizer can't prove the loop bound and
+    // warns. The runtime cost of value-initializing all N slots is
+    // non-trivial for non-trivial value types, so we suppress instead.
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
     template<ConstBufferSequence CB>
     constexpr std::size_t operator()(
         CB const& bs) const noexcept
     {
-        return tag_invoke(size_tag{}, bs);
+        std::size_t n = 0;
+        auto const e = capy::end(bs);
+        for(auto it = capy::begin(bs); it != e; ++it)
+            n += const_buffer(*it).size();
+        return n;
     }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
 } buffer_size {};
 
 /** Check if a buffer sequence contains no data.
@@ -387,6 +299,11 @@ constexpr struct buffer_size_mrdocs_workaround_t
 */
 constexpr struct buffer_empty_mrdocs_workaround_t
 {
+    // See note on buffer_size above — same union-storage false positive.
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
     template<ConstBufferSequence CB>
     constexpr bool operator()(
         CB const& bs) const noexcept
@@ -401,10 +318,11 @@ constexpr struct buffer_empty_mrdocs_workaround_t
         }
         return true;
     }
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
 } buffer_empty {};
 
-//-----------------------------------------------
-
 namespace detail {
 
 template<class It>
diff --git a/include/boost/capy/buffers/asio.hpp b/include/boost/capy/buffers/asio.hpp
index f581191c8..a2b8be26c 100644
--- a/include/boost/capy/buffers/asio.hpp
+++ b/include/boost/capy/buffers/asio.hpp
@@ -263,8 +263,6 @@ class buffer_sequence_adaptor
 
 } // detail
 
-//------------------------------------------------
-
 /** Adapt a capy buffer sequence for use with Asio.
 
     Returns a wrapper whose iterators dereference to
diff --git a/include/boost/capy/buffers/buffer_pair.hpp b/include/boost/capy/buffers/buffer_pair.hpp
deleted file mode 100644
index f217d040c..000000000
--- a/include/boost/capy/buffers/buffer_pair.hpp
+++ /dev/null
@@ -1,47 +0,0 @@
-//
-// Copyright (c) 2023 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-#ifndef BOOST_CAPY_BUFFERS_BUFFER_PAIR_HPP
-#define BOOST_CAPY_BUFFERS_BUFFER_PAIR_HPP
-
-#include <boost/capy/detail/config.hpp>
-#include <boost/capy/buffers.hpp>
-#include <array>
-
-namespace boost {
-namespace capy {
-
-/** A constant buffer pair
-*/
-using const_buffer_pair = std::array<const_buffer,2>;
-
-BOOST_CAPY_DECL
-void
-tag_invoke(
-    slice_tag const&,
-    const_buffer_pair& bs,
-    slice_how how,
-    std::size_t n) noexcept;
-
-/** A mutable buffer pair
-*/
-using mutable_buffer_pair = std::array<mutable_buffer,2>;
-
-BOOST_CAPY_DECL
-void
-tag_invoke(
-    slice_tag const&,
-    mutable_buffer_pair& bs,
-    slice_how how,
-    std::size_t n) noexcept;
-
-} // capy
-} // boost
-
-#endif
diff --git a/include/boost/capy/buffers/buffer_param.hpp b/include/boost/capy/buffers/buffer_param.hpp
index b92b026a5..dd2eea191 100644
--- a/include/boost/capy/buffers/buffer_param.hpp
+++ b/include/boost/capy/buffers/buffer_param.hpp
@@ -67,7 +67,7 @@ namespace capy {
     @ref max_size buffer descriptors, automatically refilling
     from the underlying sequence as buffers are consumed.
 
-    @par Usage
+    @par Example
 
     Create a `buffer_param` from any buffer sequence and use
     `data()` to get the current window of buffers. After
diff --git a/include/boost/capy/buffers/buffer_slice.hpp b/include/boost/capy/buffers/buffer_slice.hpp
new file mode 100644
index 000000000..2d83bfd0a
--- /dev/null
+++ b/include/boost/capy/buffers/buffer_slice.hpp
@@ -0,0 +1,125 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_BUFFERS_BUFFER_SLICE_HPP
+#define BOOST_CAPY_BUFFERS_BUFFER_SLICE_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/buffers.hpp>
+#include <boost/capy/detail/slice_impl.hpp>
+
+#include <cstddef>
+#include <limits>
+
+namespace boost {
+namespace capy {
+
+/** Return a byte-range slice of a buffer sequence.
+
+    Constructs a view over a contiguous byte range of `seq`. The
+    slice exposes its current bytes via `data()` (a buffer sequence)
+    and supports incremental consumption via `remove_prefix(n)`.
+
+    @par Return Value
+    An object of unspecified type satisfying the @ref Slice concept.
+    Bind with `auto` and operate through the concept's members. When
+    `seq` models @ref MutableBufferSequence, the returned object
+    additionally models @ref MutableSlice.
+
+    @par Lifetime
+    The returned slice is associated with `seq` as its underlying
+    buffer sequence. `seq` — and the memory referenced by its buffer
+    descriptors — must remain valid for as long as the slice, or
+    any buffer sequence obtained from its `data()`, is in use.
+    Passing a temporary buffer sequence to `buffer_slice` produces
+    a dangling slice.
+
+    The buffer sequence returned by `data()` is independent of the
+    slice object: subsequent operations on the slice (mutation,
+    copy, move, destruction) do not invalidate an already-obtained
+    `data()` view. It remains valid for as long as `seq` is valid.
+
+    Iterators and buffer descriptors obtained through `data()`
+    follow the same invalidation rules as those of `seq`.
+
+    @par Parameters
+    @li `seq` The underlying buffer sequence. Must outlive the
+        returned slice and any `data()` view obtained from it.
+    @li `offset` Number of bytes to skip from the start of `seq`.
+        Clamped to `buffer_size(seq)`.
+    @li `length` Maximum number of bytes the slice will expose,
+        starting at `offset`. Clamped to `buffer_size(seq) - offset`.
+        Defaults to the maximum value of `std::size_t`, i.e. "to end".
+
+    @par Example
+    @code
+    template< ReadStream Stream, MutableBufferSequence MB >
+    task< io_result< std::size_t > >
+    read_all( Stream& stream, MB buffers )
+    {
+        auto s = buffer_slice( buffers );
+        std::size_t const total_size = buffer_size( buffers );
+        std::size_t total = 0;
+        while( total < total_size )
+        {
+            auto [ec, n] = co_await stream.read_some( s.data() );
+            s.remove_prefix( n );
+            total += n;
+            if( ec )
+                co_return {ec, total};
+        }
+        co_return {{}, total};
+    }
+    @endcode
+
+    @see Slice, MutableSlice
+*/
+template<class BufferSequence>
+    requires MutableBufferSequence<BufferSequence>
+          || ConstBufferSequence<BufferSequence>
+auto
+buffer_slice(
+    BufferSequence const& seq,
+    std::size_t offset = 0,
+    std::size_t length =
+        (std::numeric_limits<std::size_t>::max)()) noexcept
+{
+    return detail::slice_impl<BufferSequence>(seq, offset, length);
+}
+
+/** Deleted overload that rejects rvalue arguments at compile time.
+
+    Because the returned slice's validity depends on the underlying
+    buffer sequence remaining alive, calling `buffer_slice` with a
+    temporary buffer sequence would produce an immediately dangling
+    slice. This overload makes such calls ill-formed, surfacing the
+    lifetime error at compile time rather than as runtime UB.
+
+    To slice a buffer sequence produced as a temporary, hoist it
+    into a named variable first:
+
+    @code
+    auto bufs = some_dynamic_buffer.data();   // named, lives in scope
+    auto s = buffer_slice( bufs );            // OK
+    @endcode
+*/
+template<class BufferSequence>
+    requires MutableBufferSequence<BufferSequence>
+          || ConstBufferSequence<BufferSequence>
+auto
+buffer_slice(
+    BufferSequence const&& seq,
+    std::size_t offset = 0,
+    std::size_t length =
+        (std::numeric_limits<std::size_t>::max)()) = delete;
+
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/buffers/circular_dynamic_buffer.hpp b/include/boost/capy/buffers/circular_dynamic_buffer.hpp
index 5da7410bd..0ede578d0 100644
--- a/include/boost/capy/buffers/circular_dynamic_buffer.hpp
+++ b/include/boost/capy/buffers/circular_dynamic_buffer.hpp
@@ -11,9 +11,11 @@
 #define BOOST_CAPY_BUFFERS_CIRCULAR_DYNAMIC_BUFFER_HPP
 
 #include <boost/capy/detail/config.hpp>
-#include <boost/capy/buffers/buffer_pair.hpp>
+#include <boost/capy/buffers.hpp>
 #include <boost/capy/detail/except.hpp>
 
+#include <array>
+
 namespace boost {
 namespace capy {
 
@@ -62,15 +64,22 @@ class circular_dynamic_buffer
     using is_dynamic_buffer_adapter = void;
 
     /// The ConstBufferSequence type for readable bytes.
-    using const_buffers_type = const_buffer_pair;
+    using const_buffers_type = std::array<const_buffer, 2>;
 
     /// The MutableBufferSequence type for writable bytes.
-    using mutable_buffers_type = mutable_buffer_pair;
+    using mutable_buffers_type = std::array<mutable_buffer, 2>;
 
     /// Construct an empty circular buffer with zero capacity.
     circular_dynamic_buffer() = default;
 
-    /// Copy constructor.
+    /** Construct a copy.
+
+        Copies the adapter state (position and length) but does
+        not deep-copy the backing storage. Both objects alias the
+        same external buffer.
+
+        @note The underlying storage must outlive all copies.
+    */
     circular_dynamic_buffer(
         circular_dynamic_buffer const&) = default;
 
@@ -110,7 +119,14 @@ class circular_dynamic_buffer
             detail::throw_invalid_argument();
     }
 
-    /// Copy assignment.
+    /** Assign by copying.
+
+        Copies the adapter state but does not deep-copy the
+        backing storage. Both objects alias the same external
+        buffer afterward.
+
+        @note The underlying storage must outlive all copies.
+    */
     circular_dynamic_buffer& operator=(
         circular_dynamic_buffer const&) = default;
 
diff --git a/include/boost/capy/buffers/consuming_buffers.hpp b/include/boost/capy/buffers/consuming_buffers.hpp
deleted file mode 100644
index 024625e93..000000000
--- a/include/boost/capy/buffers/consuming_buffers.hpp
+++ /dev/null
@@ -1,235 +0,0 @@
-//
-// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-#ifndef BOOST_CAPY_BUFFERS_CONSUMING_BUFFERS_HPP
-#define BOOST_CAPY_BUFFERS_CONSUMING_BUFFERS_HPP
-
-#include <boost/capy/detail/config.hpp>
-#include <boost/capy/buffers.hpp>
-
-#include <cstddef>
-#include <iterator>
-#include <ranges>
-#include <type_traits>
-
-namespace boost {
-namespace capy {
-
-namespace detail {
-
-template<class T>
-struct buffer_type_for;
-
-template<MutableBufferSequence T>
-struct buffer_type_for<T>
-{
-    using type = mutable_buffer;
-};
-
-template<ConstBufferSequence T>
-    requires (!MutableBufferSequence<T>)
-struct buffer_type_for<T>
-{
-    using type = const_buffer;
-};
-
-} // namespace detail
-
-/** Wrapper for consuming a buffer sequence incrementally.
-
-    This class wraps a buffer sequence and tracks the current
-    position. It provides a `consume(n)` function that advances
-    through the sequence as bytes are processed.
-
-    Works with both mutable and const buffer sequences.
-
-    @tparam BufferSequence The buffer sequence type.
-*/
-template<class BufferSequence>
-    requires MutableBufferSequence<BufferSequence> ||
-             ConstBufferSequence<BufferSequence>
-class consuming_buffers
-{
-    using iterator_type = decltype(capy::begin(std::declval<BufferSequence const&>()));
-    using end_iterator_type = decltype(capy::end(std::declval<BufferSequence const&>()));
-    using buffer_type = typename detail::buffer_type_for<BufferSequence>::type;
-
-    BufferSequence const& bufs_;
-    iterator_type it_;
-    end_iterator_type end_;
-    std::size_t consumed_ = 0;  // Bytes consumed in current buffer
-
-public:
-    /** Construct from a buffer sequence.
-
-        @param bufs The buffer sequence to wrap.
-    */
-    explicit consuming_buffers(BufferSequence const& bufs) noexcept
-        : bufs_(bufs)
-        , it_(capy::begin(bufs))
-        , end_(capy::end(bufs))
-    {
-    }
-
-    /** Consume n bytes from the buffer sequence.
-
-        Advances the current position by n bytes, moving to the
-        next buffer when the current one is exhausted.
-
-        @param n The number of bytes to consume.
-    */
-    void consume(std::size_t n) noexcept
-    {
-        while (n > 0 && it_ != end_)
-        {
-            auto const& buf = *it_;
-            std::size_t const buf_size = buf.size();
-            std::size_t const remaining = buf_size - consumed_;
-
-            if (n < remaining)
-            {
-                // Consume part of current buffer
-                consumed_ += n;
-                n = 0;
-            }
-            else
-            {
-                // Consume rest of current buffer and move to next
-                n -= remaining;
-                consumed_ = 0;
-                ++it_;
-            }
-        }
-    }
-
-    /** Iterator for the consuming buffer sequence.
-
-        Returns buffers starting from the current position,
-        with the first buffer adjusted for consumed bytes.
-    */
-    class const_iterator
-    {
-        iterator_type it_;
-        end_iterator_type end_;
-        std::size_t consumed_;
-
-    public:
-        using iterator_category = std::bidirectional_iterator_tag;
-        using value_type = buffer_type;
-        using difference_type = std::ptrdiff_t;
-        using pointer = value_type*;
-        using reference = value_type;
-
-        // Default constructor required for forward_iterator
-        const_iterator() noexcept = default;
-
-        const_iterator(
-            iterator_type it,
-            end_iterator_type end,
-            std::size_t consumed) noexcept
-            : it_(it)
-            , end_(end)
-            , consumed_(consumed)
-        {
-        }
-
-        bool operator==(const_iterator const& other) const noexcept
-        {
-            return it_ == other.it_ && consumed_ == other.consumed_;
-        }
-
-        // != operator required for equality_comparable
-        bool operator!=(const_iterator const& other) const noexcept
-        {
-            return !(*this == other);
-        }
-
-        value_type operator*() const noexcept
-        {
-            auto const& buf = *it_;
-            if constexpr (std::is_same_v<buffer_type, mutable_buffer>)
-            {
-                return buffer_type(
-                    static_cast<char*>(buf.data()) + consumed_,
-                    buf.size() - consumed_);
-            }
-            else
-            {
-                return buffer_type(
-                    static_cast<char const*>(buf.data()) + consumed_,
-                    buf.size() - consumed_);
-            }
-        }
-
-        const_iterator& operator++() noexcept
-        {
-            consumed_ = 0;
-            ++it_;
-            return *this;
-        }
-
-        const_iterator operator++(int) noexcept
-        {
-            const_iterator tmp = *this;
-            ++*this;
-            return tmp;
-        }
-
-        const_iterator& operator--() noexcept
-        {
-            if (consumed_ == 0)
-            {
-                --it_;
-                // Set consumed_ to the size of the previous buffer
-                // This is a simplified implementation for bidirectional requirement
-                if (it_ != end_)
-                {
-                    auto const& buf = *it_;
-                    consumed_ = buf.size();
-                }
-            }
-            else
-            {
-                consumed_ = 0;
-            }
-            return *this;
-        }
-
-        const_iterator operator--(int) noexcept
-        {
-            const_iterator tmp = *this;
-            --*this;
-            return tmp;
-        }
-    };
-
-    /** Return iterator to beginning of remaining buffers.
-
-        @return Iterator pointing to the first remaining buffer,
-            adjusted for consumed bytes in the current buffer.
-    */
-    const_iterator begin() const noexcept
-    {
-        return const_iterator(it_, end_, consumed_);
-    }
-
-    /** Return iterator to end of buffer sequence.
-
-        @return End iterator.
-    */
-    const_iterator end() const noexcept
-    {
-        return const_iterator(end_, end_, 0);
-    }
-};
-
-} // namespace capy
-} // namespace boost
-
-#endif
diff --git a/include/boost/capy/buffers/flat_dynamic_buffer.hpp b/include/boost/capy/buffers/flat_dynamic_buffer.hpp
index a9c881e5e..a487b6898 100644
--- a/include/boost/capy/buffers/flat_dynamic_buffer.hpp
+++ b/include/boost/capy/buffers/flat_dynamic_buffer.hpp
@@ -89,11 +89,11 @@ class flat_dynamic_buffer
             detail::throw_invalid_argument();
     }
 
-    /// Copy constructor.
+    /// Construct a copy.
     flat_dynamic_buffer(
         flat_dynamic_buffer const&) = default;
 
-    /// Copy assignment.
+    /// Assign by copying.
     flat_dynamic_buffer& operator=(
         flat_dynamic_buffer const&) = default;
 
diff --git a/include/boost/capy/buffers/front.hpp b/include/boost/capy/buffers/front.hpp
index a7265a611..13626d3bd 100644
--- a/include/boost/capy/buffers/front.hpp
+++ b/include/boost/capy/buffers/front.hpp
@@ -20,6 +20,7 @@ namespace capy {
 */
 constexpr struct front_mrdocs_workaround_t
 {
+    /// Return the first mutable buffer, or an empty buffer.
     template<MutableBufferSequence MutableBufferSequence>
     mutable_buffer
     operator()(
@@ -31,6 +32,7 @@ constexpr struct front_mrdocs_workaround_t
         return {};
     }
 
+    /// Return the first const buffer, or an empty buffer.
     template<ConstBufferSequence ConstBufferSequence>
         requires (!MutableBufferSequence<ConstBufferSequence>)
     const_buffer
diff --git a/include/boost/capy/buffers/make_buffer.hpp b/include/boost/capy/buffers/make_buffer.hpp
index 031555fb6..791d2c353 100644
--- a/include/boost/capy/buffers/make_buffer.hpp
+++ b/include/boost/capy/buffers/make_buffer.hpp
@@ -22,10 +22,8 @@
 #include <type_traits>
 #include <vector>
 
-#ifdef _MSC_VER
-#pragma warning(push)
-#pragma warning(disable: 4459)
-#endif
+BOOST_CAPY_MSVC_WARNING_PUSH
+BOOST_CAPY_MSVC_WARNING_DISABLE(4459)
 
 namespace boost {
 namespace capy {
@@ -182,9 +180,7 @@ make_buffer(
         N * sizeof(T) < max_size ? N * sizeof(T) : max_size);
 }
 
-//------------------------------------------------
 // std::array
-//------------------------------------------------
 
 /** Return a buffer from a std::array.
 */
@@ -244,9 +240,7 @@ make_buffer(
             ? data.size() * sizeof(T) : max_size);
 }
 
-//------------------------------------------------
 // std::vector
-//------------------------------------------------
 
 /** Return a buffer from a std::vector.
 */
@@ -308,9 +302,7 @@ make_buffer(
             ? data.size() * sizeof(T) : max_size);
 }
 
-//------------------------------------------------
 // std::basic_string
-//------------------------------------------------
 
 /** Return a buffer from a std::basic_string.
 */
@@ -368,9 +360,7 @@ make_buffer(
             ? data.size() * sizeof(CharT) : max_size);
 }
 
-//------------------------------------------------
 // std::basic_string_view
-//------------------------------------------------
 
 /** Return a buffer from a std::basic_string_view.
 */
@@ -400,9 +390,7 @@ make_buffer(
             ? data.size() * sizeof(CharT) : max_size);
 }
 
-//------------------------------------------------
 // std::span
-//------------------------------------------------
 
 /** Return a buffer from a mutable std::span.
 */
@@ -458,9 +446,7 @@ make_buffer(
         data.size() < max_size ? data.size() : max_size);
 }
 
-//------------------------------------------------
 // Contiguous ranges
-//------------------------------------------------
 
 namespace detail {
 
@@ -543,8 +529,6 @@ make_buffer(
 } // capy
 } // boost
 
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
+BOOST_CAPY_MSVC_WARNING_POP
 
 #endif
diff --git a/include/boost/capy/buffers/slice.hpp b/include/boost/capy/buffers/slice.hpp
deleted file mode 100644
index a9b91fc71..000000000
--- a/include/boost/capy/buffers/slice.hpp
+++ /dev/null
@@ -1,535 +0,0 @@
-//
-// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-#ifndef BOOST_CAPY_BUFFERS_SLICE_HPP
-#define BOOST_CAPY_BUFFERS_SLICE_HPP
-
-#include <boost/capy/detail/config.hpp>
-#include <boost/capy/buffers.hpp>
-#include <array>
-#include <cassert>
-#include <iterator>
-#include <type_traits>
-
-namespace boost {
-namespace capy {
-
-template<class T> class slice_of;
-
-namespace detail {
-
-template<class T, class = void>
-struct has_tag_invoke : std::false_type {};
-
-template<class T>
-struct has_tag_invoke<T, decltype(tag_invoke(
-    std::declval<slice_tag const&>(),
-    std::declval<T&>(),
-    std::declval<slice_how>(),
-    std::declval<std::size_t>()))>
-    : std::true_type {};
-
-} // detail
-
-/** Alias for the type representing a slice of T
-*/
-template<class T>
-using slice_type = std::conditional_t<
-    detail::has_tag_invoke<T>::value,
-    T, slice_of<T>>;
-
-//------------------------------------------------
-
-/** A wrapper enabling a buffer sequence to be consumed
-*/
-template<ConstBufferSequence BufferSequence>
-class slice_of<BufferSequence>
-{
-    static_assert(!std::is_const_v<BufferSequence>,
-        "BufferSequence can't be const");
-
-    static_assert(!std::is_reference_v<BufferSequence>,
-        "BufferSequence can't be a reference");
-
-    using iter_type = decltype(
-        std::declval<BufferSequence const&>().begin());
-
-    using difference_type =
-        typename std::iterator_traits<iter_type>::difference_type;
-
-    BufferSequence bs_;
-    difference_type begin_ = 0; // index of first buffer in sequence
-    difference_type end_ = 0;   // 1 + index of last buffer in sequence
-    std::size_t len_ = 0;       // length of bs_
-    std::size_t size_ = 0;      // total bytes
-    std::size_t prefix_ = 0;    // used prefix bytes
-    std::size_t suffix_ = 0;    // used suffix bytes
-
-public:
-    /** The type of values returned by iterators
-    */
-    using value_type = std::conditional_t<
-        MutableBufferSequence<BufferSequence>,
-        mutable_buffer, const_buffer>;
-
-    /** The type of returned iterators
-    */
-    class const_iterator
-    {
-        iter_type it_;
-        // VFALCO we could just point back to
-        // the original sequence to save size
-        std::size_t prefix_ = 0;
-        std::size_t suffix_ = 0;
-        std::size_t i_ = 0;
-        std::size_t n_ = 0;
-
-        friend class slice_of<BufferSequence>;
-
-        const_iterator(
-            iter_type it,
-            std::size_t prefix__,
-            std::size_t suffix__,
-            std::size_t i,
-            std::size_t n) noexcept
-            : it_(it)
-            , prefix_(prefix__)
-            , suffix_(suffix__)
-            , i_(i)
-            , n_(n)
-        {
-            // n_ is the index of the end iterator
-        }
-
-    public:
-        using value_type = typename slice_of::value_type;
-        using reference = value_type;
-        using pointer = void;
-        using difference_type = std::ptrdiff_t;
-        using iterator_category =
-            std::bidirectional_iterator_tag;
-        using iterator_concept = std::bidirectional_iterator_tag;
-
-        const_iterator() = default;
-
-        bool
-        operator==(
-            const_iterator const& other) const noexcept
-        {
-            return
-                it_     == other.it_ &&
-                prefix_ == other.prefix_ &&
-                suffix_ == other.suffix_ &&
-                i_      == other.i_ &&
-                n_      == other.n_;
-        }
-
-        bool
-        operator!=(
-            const_iterator const& other) const noexcept
-        {
-            return !(*this == other);
-        }
-
-        reference
-        operator*() const noexcept
-        {
-            value_type v = *it_;
-            using P = std::conditional_t<
-                MutableBufferSequence<BufferSequence>,
-                char*, char const*>;
-            auto p = reinterpret_cast<P>(v.data());
-            auto n = v.size();
-            if(i_ == 0)
-            {
-                p += prefix_;
-                n -= prefix_;
-            }
-            if(i_ == n_ - 1)
-                n -= suffix_;
-            return value_type(p, n);
-        }
-
-        const_iterator&
-        operator++() noexcept
-        {
-            BOOST_CAPY_ASSERT(i_ < n_);
-            ++it_;
-            ++i_;
-            return *this;
-        }
-
-        const_iterator
-        operator++(int) noexcept
-        {
-            auto temp = *this;
-            ++(*this);
-            return temp;
-        }
-
-        const_iterator&
-        operator--() noexcept
-        {
-            BOOST_CAPY_ASSERT(i_ > 0);
-            --it_;
-            --i_;
-            return *this;
-        }
-
-        const_iterator
-        operator--(int) noexcept
-        {
-            auto temp = *this;
-            --(*this);
-            return temp;
-        }
-    };
-
-    /** Constructor
-    */
-    slice_of() = default;
-
-    /** Constructor
-    */
-    slice_of(
-        BufferSequence const& bs)
-        : bs_(bs)
-    {
-        iter_type it = capy::begin(bs_);
-        iter_type eit = capy::end(bs_);
-        begin_ = 0;
-        end_ = std::distance(it, eit);
-        while(it != eit)
-        {
-            value_type b(*it);
-            size_ += b.size();
-            ++len_;
-            ++it;
-        }
-    }
-
-    /** Return an iterator to the beginning of the sequence
-    */
-    const_iterator
-    begin() const noexcept
-    {
-        return const_iterator(
-            begin_iter_impl(), prefix_, suffix_, 0, len_);
-    }
-
-    /** Return an iterator to the end of the sequence
-    */
-    const_iterator
-    end() const noexcept
-    {
-        return const_iterator(
-            end_iter_impl(), prefix_, suffix_, len_, len_);
-    }
-
-    friend
-    void
-    tag_invoke(
-        slice_tag const&,
-        slice_of<BufferSequence>& bs,
-        slice_how how,
-        std::size_t n)
-    {
-        bs.slice_impl(how, n);
-    }
-
-private:
-    iter_type
-    begin_iter_impl() const noexcept
-    {
-        iter_type it = capy::begin(bs_);
-        std::advance(it, begin_);
-        return it;
-    }
-
-    iter_type
-    end_iter_impl() const noexcept
-    {
-        iter_type it = capy::begin(bs_);
-        std::advance(it, end_);
-        return it;
-    }
-
-    void
-    remove_prefix_impl(
-        std::size_t n)
-    {
-        if(n > size_)
-            n = size_;
-
-        // nice hack to simplify the loop (M. Nejati)
-        n += prefix_;
-        size_ += prefix_;
-        prefix_ = 0;
-
-        iter_type it = begin_iter_impl();
-
-        while(n > 0 && begin_ != end_)
-        {
-            value_type b = *it;
-            if(n < b.size())
-            {
-                prefix_ = n;
-                size_ -= n;
-                break;
-            }
-            n -= b.size();
-            size_ -= b.size();
-            ++begin_;
-            ++it;
-            --len_;
-        }
-    }
-
-    void
-    remove_suffix_impl(
-        std::size_t n)
-    {
-        if(size_ == 0)
-        {
-            BOOST_CAPY_ASSERT(begin_ == end_);
-            return;
-        }
-        BOOST_CAPY_ASSERT(begin_ != end_);
-
-        if(n > size_)
-            n = size_;
-
-        n += suffix_;
-        size_ += suffix_;
-        suffix_ = 0;
-
-        iter_type bit = begin_iter_impl();
-        iter_type it = end_iter_impl();
-        it--;
-
-        while(it != bit)
-        {
-            value_type b = *it;
-            if(n < b.size())
-            {
-                suffix_ = n;
-                size_ -= n;
-                return;
-            }
-            n -= b.size();
-            size_ -= b.size();
-            --it;
-            --end_;
-            --len_;
-        }
-        value_type b = *it;
-        auto m = b.size() - prefix_;
-        if(n < m)
-        {
-            suffix_ = n;
-            size_ -= n;
-            return;
-        }
-        end_ = begin_;
-        len_ = 0;
-        size_ = 0;
-    }
-
-    void
-    keep_prefix_impl(
-        std::size_t n)
-    {
-        if(n >= size_)
-            return;
-        if(n == 0)
-        {
-            end_ = begin_;
-            len_ = 0;
-            size_ = 0;
-            return;
-        }
-        remove_suffix_impl(size_ - n);
-    }
-
-    void
-    keep_suffix_impl(
-        std::size_t n)
-    {
-        if(n >= size_)
-            return;
-        if(n == 0)
-        {
-            begin_ = end_;
-            len_ = 0;
-            size_ = 0;
-            return;
-        }
-        remove_prefix_impl(size_ - n);
-    }
-
-    void
-    slice_impl(
-        slice_how how,
-        std::size_t n)
-    {
-        switch(how)
-        {
-        case slice_how::remove_prefix:
-        {
-            remove_prefix_impl(n);
-            break;
-        }
-        case slice_how::keep_prefix:
-        {
-            keep_prefix_impl(n);
-            break;
-        }
-        }
-    }
-};
-
-//------------------------------------------------
-
-// in-place modify  return value
-// -----------------------------
-// keep_prefix*     prefix
-// keep_suffix      suffix
-// remove_prefix*   sans_prefix
-// remove_suffix    sans_suffix
-
-/** Remove all but the first `n` bytes from a buffer sequence
-*/
-constexpr struct keep_prefix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-        requires detail::has_tag_invoke<BufferSequence>::value
-    void operator()(
-        BufferSequence& bs,
-        std::size_t n) const
-    {
-        tag_invoke(slice_tag{}, bs, slice_how::keep_prefix, n);
-    }
-} const keep_prefix{};
-
-/** Remove all but the last `n` bytes from a buffer sequence
-*/
-constexpr struct keep_suffix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-        requires detail::has_tag_invoke<BufferSequence>::value
-    void operator()(
-        BufferSequence& bs,
-        std::size_t n) const
-    {
-        auto n0 = buffer_size(bs);
-        if(n < n0)
-            tag_invoke(slice_tag{}, bs, slice_how::remove_prefix, n0 - n);
-    }
-} const keep_suffix{};
-
-/** Remove `n` bytes from the beginning of a buffer sequence
-*/
-constexpr struct remove_prefix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-        requires detail::has_tag_invoke<BufferSequence>::value
-    void operator()(
-        BufferSequence& bs,
-        std::size_t n) const
-    {
-        tag_invoke(slice_tag{}, bs, slice_how::remove_prefix, n);
-    }
-} const remove_prefix{};
-
-/** Remove `n` bytes from the end of a buffer sequence
-*/
-constexpr struct remove_suffix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-        requires detail::has_tag_invoke<BufferSequence>::value
-    void operator()(
-        BufferSequence& bs,
-        std::size_t n) const
-    {
-        auto n0 = buffer_size(bs);
-        if(n > 0)
-        {
-            if( n > n0)
-                n = n0;
-            tag_invoke(slice_tag{}, bs, slice_how::keep_prefix, n0 - n);
-        }
-    }
-} const remove_suffix{};
-
-//------------------------------------------------
-
-/** Return a sequence representing the first `n` bytes of a buffer sequence
-*/
-constexpr struct prefix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-    slice_type<BufferSequence> operator()(
-        BufferSequence const& bs,
-        std::size_t n) const noexcept
-    {
-        slice_type<BufferSequence> result(bs);
-        keep_prefix(result, n);
-        return result;
-    }
-} prefix{};
-
-/** Return a sequence representing the last `n` bytes of a buffer sequence
-*/
-constexpr struct suffix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-    slice_type<BufferSequence> operator()(
-        BufferSequence const& bs,
-        std::size_t n) const noexcept
-    {
-        slice_type<BufferSequence> result(bs);
-        keep_suffix(result, n);
-        return result;
-    }
-} suffix{};
-
-/** Return a sequence representing all but the first `n` bytes of a buffer sequence
-*/
-constexpr struct sans_prefix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-    slice_type<BufferSequence> operator()(
-        BufferSequence const& bs,
-        std::size_t n) const noexcept
-    {
-        slice_type<BufferSequence> result(bs);
-        remove_prefix(result, n);
-        return result;
-    }
-} sans_prefix{};
-
-/** Return a sequence representing all but the last `n` bytes of a buffer sequence
-*/
-constexpr struct sans_suffix_mrdocs_workaround_t
-{
-    template<ConstBufferSequence BufferSequence>
-    slice_type<BufferSequence> operator()(
-        BufferSequence const& bs,
-        std::size_t n) const noexcept
-    {
-        slice_type<BufferSequence> result(bs);
-        remove_suffix(result, n);
-        return result;
-    }
-} sans_suffix{};
-
-} // capy
-} // boost
-
-#endif
diff --git a/include/boost/capy/buffers/string_dynamic_buffer.hpp b/include/boost/capy/buffers/string_dynamic_buffer.hpp
index 9f33c31ce..0b98adb35 100644
--- a/include/boost/capy/buffers/string_dynamic_buffer.hpp
+++ b/include/boost/capy/buffers/string_dynamic_buffer.hpp
@@ -18,7 +18,38 @@
 namespace boost {
 namespace capy {
 
-/** A dynamic buffer using an underlying string
+/** A dynamic buffer backed by a `std::basic_string`.
+
+    This adapter wraps an externally-owned string and
+    exposes it through the @ref DynamicBuffer interface.
+    Readable bytes occupy the front of the string; writable
+    bytes are appended by `prepare` and made readable by
+    `commit`.
+
+    @note The wrapped string must outlive this adapter.
+        Calls to `prepare`, `commit`, and `consume`
+        invalidate previously returned buffer views.
+
+    @par Thread Safety
+    Distinct objects: Safe.
+    Shared objects: Unsafe.
+
+    @par Example
+    @code
+    std::string s;
+    auto buf = dynamic_buffer( s, 4096 );
+    auto mb = buf.prepare( 100 );
+    // fill mb with data...
+    buf.commit( 100 );
+    // buf.data() now has 100 readable bytes
+    buf.consume( 50 );
+    @endcode
+
+    @tparam CharT The character type.
+    @tparam Traits The character traits type.
+    @tparam Allocator The allocator type.
+
+    @see DynamicBuffer, string_dynamic_buffer, dynamic_buffer
 */
 template<
     class CharT,
@@ -34,16 +65,23 @@ class basic_string_dynamic_buffer
     std::size_t out_size_ = 0;
 
 public:
+    /// Indicates this is a DynamicBuffer adapter over external storage.
     using is_dynamic_buffer_adapter = void;
+
+    /// The underlying string type.
     using string_type = std::basic_string<
         CharT, Traits, Allocator>;
+
+    /// The ConstBufferSequence type for readable bytes.
     using const_buffers_type = const_buffer;
+
+    /// The MutableBufferSequence type for writable bytes.
     using mutable_buffers_type = mutable_buffer;
 
+    /// Destroy the buffer.
     ~basic_string_dynamic_buffer() = default;
 
-    /** Constructor.
-    */
+    /// Construct by moving from another buffer.
     basic_string_dynamic_buffer(
         basic_string_dynamic_buffer&& other) noexcept
         : s_(other.s_)
@@ -54,7 +92,12 @@ class basic_string_dynamic_buffer
         other.s_ = nullptr;
     }
 
-    /** Constructor.
+    /** Construct from an existing string.
+
+        @param s Pointer to the string to wrap. Must
+            remain valid for the lifetime of this object.
+        @param max_size Optional upper bound on the number
+            of bytes the buffer may hold.
     */
     explicit
     basic_string_dynamic_buffer(
@@ -72,23 +115,25 @@ class basic_string_dynamic_buffer
         in_size_ = s_->size();
     }
 
-    /** Assignment.
-    */
+    /// Copy assignment is deleted.
     basic_string_dynamic_buffer& operator=(
         basic_string_dynamic_buffer const&) = delete;
 
+    /// Return the number of readable bytes.
     std::size_t
     size() const noexcept
     {
         return in_size_;
     }
 
+    /// Return the maximum number of bytes the buffer can hold.
     std::size_t
     max_size() const noexcept
     {
         return max_size_;
     }
 
+    /// Return the number of writable bytes without reallocation.
     std::size_t
     capacity() const noexcept
     {
@@ -97,6 +142,7 @@ class basic_string_dynamic_buffer
         return max_size_ - in_size_;
     }
 
+    /// Return a buffer sequence representing the readable bytes.
     const_buffers_type
     data() const noexcept
     {
@@ -104,6 +150,18 @@ class basic_string_dynamic_buffer
             s_->data(), in_size_);
     }
 
+    /** Prepare writable space of at least `n` bytes.
+
+        Invalidates iterators and references returned by
+        previous calls to `data` and `prepare`.
+
+        @throws std::invalid_argument if `n` exceeds
+            available space.
+
+        @param n The number of bytes to prepare.
+
+        @return A mutable buffer of exactly `n` bytes.
+    */
     mutable_buffers_type
     prepare(std::size_t n)
     {
@@ -118,6 +176,14 @@ class basic_string_dynamic_buffer
             &(*s_)[in_size_], out_size_);
     }
 
+    /** Move bytes from the writable to the readable area.
+
+        Invalidates iterators and references returned by
+        previous calls to `data` and `prepare`.
+
+        @param n The number of bytes to commit. Clamped
+            to the size of the writable area.
+    */
     void commit(std::size_t n) noexcept
     {
         if(n < out_size_)
@@ -128,6 +194,14 @@ class basic_string_dynamic_buffer
         s_->resize(in_size_);
     }
 
+    /** Remove bytes from the beginning of the readable area.
+
+        Invalidates iterators and references returned by
+        previous calls to `data` and `prepare`.
+
+        @param n The number of bytes to consume. Clamped
+            to the number of readable bytes.
+    */
     void consume(std::size_t n) noexcept
     {
         if(n < in_size_)
@@ -144,6 +218,7 @@ class basic_string_dynamic_buffer
     }
 };
 
+/// A dynamic buffer using `std::string`.
 using string_dynamic_buffer = basic_string_dynamic_buffer<char>;
 
 /** Create a dynamic buffer from a string.
diff --git a/include/boost/capy/buffers/vector_dynamic_buffer.hpp b/include/boost/capy/buffers/vector_dynamic_buffer.hpp
index 9ef97e914..05c8366cd 100644
--- a/include/boost/capy/buffers/vector_dynamic_buffer.hpp
+++ b/include/boost/capy/buffers/vector_dynamic_buffer.hpp
@@ -81,9 +81,10 @@ class basic_vector_dynamic_buffer
     /// The MutableBufferSequence type for writable bytes.
     using mutable_buffers_type = mutable_buffer;
 
+    /// Destroy the buffer.
     ~basic_vector_dynamic_buffer() = default;
 
-    /** Move constructor.
+    /** Construct by moving.
     */
     basic_vector_dynamic_buffer(
         basic_vector_dynamic_buffer&& other) noexcept
diff --git a/include/boost/capy/concept/buffer_archetype.hpp b/include/boost/capy/concept/buffer_archetype.hpp
index 2423eec3d..adc2eaea7 100644
--- a/include/boost/capy/concept/buffer_archetype.hpp
+++ b/include/boost/capy/concept/buffer_archetype.hpp
@@ -40,18 +40,17 @@ struct const_buffer_archetype_
     const_buffer_archetype_& operator=(const_buffer_archetype_ const&) = default;
     const_buffer_archetype_& operator=(const_buffer_archetype_&&) = default;
 
+    /// Convert to const_buffer.
     operator const_buffer() const noexcept { return {}; }
 };
 
 #ifdef __clang__
-// workaround: archetype crashes clang
 using const_buffer_archetype = const_buffer;
 #else
+/// Alias for the const buffer archetype type.
 using const_buffer_archetype = const_buffer_archetype_;
 #endif
 
-//------------------------------------------------
-
 /** Archetype for MutableBufferSequence concept checking.
 
     This type satisfies @ref MutableBufferSequence but cannot be
@@ -76,14 +75,17 @@ struct mutable_buffer_archetype_
     mutable_buffer_archetype_& operator=(mutable_buffer_archetype_ const&) = default;
     mutable_buffer_archetype_& operator=(mutable_buffer_archetype_&&) = default;
 
+    /// Convert to mutable_buffer.
     operator mutable_buffer() const noexcept { return {}; }
+
+    /// Convert to const_buffer.
     operator const_buffer() const noexcept { return {}; }
 };
 
 #ifdef __clang__
-// workaround: archetype crashes clang
 using mutable_buffer_archetype = mutable_buffer;
 #else
+/// Alias for the mutable buffer archetype type.
 using mutable_buffer_archetype = mutable_buffer_archetype_;
 #endif
 
diff --git a/include/boost/capy/concept/buffer_source.hpp b/include/boost/capy/concept/buffer_source.hpp
index e94efda2d..5ec0cc522 100644
--- a/include/boost/capy/concept/buffer_source.hpp
+++ b/include/boost/capy/concept/buffer_source.hpp
@@ -26,9 +26,8 @@ namespace capy {
 /** Concept for types that produce buffer data asynchronously.
 
     A type satisfies `BufferSource` if it provides a `pull` member function
-    that fills a caller-provided span of buffer descriptors and is an
-    @ref IoAwaitable whose return value decomposes to
-    `(error_code,std::span<const_buffer>)`, plus a `consume` member function
+    that fills a caller-provided span of buffer descriptors and
+    await-returns `(error_code, std::span<const_buffer>)`, plus a `consume` member function
     to indicate how many bytes were used.
 
     Use this concept when you need to produce data asynchronously for
diff --git a/include/boost/capy/concept/dynamic_buffer.hpp b/include/boost/capy/concept/dynamic_buffer.hpp
index 68425a114..a6b40a833 100644
--- a/include/boost/capy/concept/dynamic_buffer.hpp
+++ b/include/boost/capy/concept/dynamic_buffer.hpp
@@ -26,7 +26,7 @@
        - MUST be passed by lvalue reference to preserve state
        - Passing as rvalue loses bookkeeping on coroutine suspend
 
-    2. WRAPPER ADAPTERS (e.g., string_buffers)
+    2. WRAPPER ADAPTERS (e.g., string_dynamic_buffer)
        - Reference external storage (std::string, std::vector)
        - Safe to pass as rvalues; external object retains data
        - Define `using is_dynamic_buffer_adapter = void;`
@@ -67,7 +67,7 @@ namespace capy {
     - **Value types** (e.g., `flat_dynamic_buffer`) store bookkeeping
       internally. Passing as rvalue to a coroutine loses state on suspend.
 
-    - **Wrapper adapters** (e.g., `string_buffers`) reference external
+    - **Wrapper adapters** (e.g., `string_dynamic_buffer`) reference external
       storage and are safe as rvalues since the external object persists.
 
     @par Conforming Signatures
@@ -125,7 +125,7 @@ concept DynamicBuffer =
 
     The distinction exists because some buffer types (like `flat_dynamic_buffer`)
     store bookkeeping internally that would be lost if passed by rvalue,
-    while adapters (like `string_buffers`) update external storage directly.
+    while adapters (like `string_dynamic_buffer`) update external storage directly.
 
     @par Conforming Signatures
     For coroutine functions, use a forwarding reference:
@@ -141,17 +141,17 @@ concept DynamicBuffer =
     // WRONG: lvalue ref rejects valid rvalue adapters
     void bad1( DynamicBufferParam auto& buffers );
     bad1( fb );                    // OK
-    bad1( string_buffers( s ) );   // compile error, but should work
+    bad1( string_dynamic_buffer( &s ) );   // compile error, but should work
 
     // WRONG: const ref deduces non-reference, rejects non-adapters
     void bad2( DynamicBufferParam auto const& buffers );
     bad2( fb );                    // compile error, but should work
-    bad2( string_buffers( s ) );   // OK (adapter only)
+    bad2( string_dynamic_buffer( &s ) );   // OK (adapter only)
 
     // CORRECT: forwarding ref enables proper checking
     void good( DynamicBufferParam auto&& buffers );
     good( fb );                    // OK: lvalue
-    good( string_buffers( s ) );   // OK: adapter rvalue
+    good( string_dynamic_buffer( &s ) );   // OK: adapter rvalue
     good( flat_dynamic_buffer( storage ) );  // compile error: non-adapter rvalue
     @endcode
 
diff --git a/include/boost/capy/concept/executor.hpp b/include/boost/capy/concept/executor.hpp
index dc9d1a209..d7582d9ff 100644
--- a/include/boost/capy/concept/executor.hpp
+++ b/include/boost/capy/concept/executor.hpp
@@ -11,6 +11,7 @@
 #define BOOST_CAPY_CONCEPT_EXECUTOR_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/continuation.hpp>
 
 #include <concepts>
 #include <coroutine>
@@ -39,13 +40,13 @@ class execution_context;
     @par Syntactic Requirements
 
     @li `E` must be nothrow copy and move constructible
-    @li `e1 == e2` must return a type convertible to `bool`, `noexcept`
-    @li `e.context()` must return an lvalue reference to a type derived
+    @li `ce == ce2` must return a type convertible to `bool`, `noexcept`
+    @li `ce.context()` must return an lvalue reference to a type derived
         from `execution_context`, `noexcept`
-    @li `e.on_work_started()` must be valid and `noexcept`
-    @li `e.on_work_finished()` must be valid and `noexcept`
-    @li `e.dispatch(h)` must return `std::coroutine_handle<>`
-    @li `e.post(h)` must be valid
+    @li `ce.on_work_started()` must be valid and `noexcept`
+    @li `ce.on_work_finished()` must be valid and `noexcept`
+    @li `ce.dispatch(c)` must return `std::coroutine_handle<>`
+    @li `ce.post(c)` must be valid
 
     @par Semantic Requirements
 
@@ -75,9 +76,9 @@ class execution_context;
     resumed coroutine.
 
     @li If the executor determines it is safe to resume inline
-        (e.g., already on the correct thread), returns `h` for
+        (e.g., already on the correct thread), returns `c.h` for
         the caller to use in symmetric transfer
-    @li Otherwise, posts the coroutine for later execution and
+    @li Otherwise, posts the continuation for later execution and
         returns `std::noop_coroutine()`
     @li The caller is responsible for using the returned handle
         appropriately: returning it from `await_suspend` for
@@ -88,11 +89,11 @@ class execution_context;
 
     @code
     std::coroutine_handle<> dispatch(
-        std::coroutine_handle<> h ) const
+        continuation& c ) const
     {
         if( ctx_.is_running_on_this_thread() )
-            return h;              // symmetric transfer
-        post( h );
+            return c.h;            // symmetric transfer
+        post( c );
         return std::noop_coroutine();
     }
     @endcode
@@ -102,6 +103,32 @@ class execution_context;
     @li Never blocks the caller
     @li The coroutine executes on the executor's associated context
 
+    @par Continuation Lifetime
+
+    Both `dispatch` and `post` operate on the caller's
+    continuation object by reference. The continuation must
+    remain at a stable address and must not be moved or
+    destroyed until the executor has dequeued and resumed it.
+    Destroying or moving a continuation while it is linked
+    into an executor's queue is undefined behavior.
+
+    When `dispatch` returns `c.h` (the inline case), the
+    continuation is not enqueued and may be reused or
+    destroyed immediately. When `dispatch` falls through to
+    `post`, the continuation is enqueued and the lifetime
+    requirement applies.
+
+    @par Frame Allocator TLS
+
+    The library propagates a frame allocator via thread-local
+    storage. When a custom executor's event loop calls
+    `.resume()` to drain its work queue, it must use
+    `safe_resume()` from `<boost/capy/ex/frame_allocator.hpp>`
+    instead of calling `h.resume()` directly. This saves and
+    restores the thread-local frame allocator around the call,
+    preventing a resumed coroutine from permanently overwriting
+    the caller's value.
+
     @par Executor Validity
 
     An executor becomes invalid when the first call to
@@ -126,8 +153,8 @@ class execution_context;
         void on_work_finished() const noexcept;
 
         std::coroutine_handle<> dispatch(
-            std::coroutine_handle<> h ) const;
-        void post( std::coroutine_handle<> h ) const;
+            continuation& c ) const;
+        void post( continuation& c ) const;
 
         bool operator==( E const& ) const noexcept;
     };
@@ -139,7 +166,7 @@ template<class E>
 concept Executor =
     std::is_nothrow_copy_constructible_v<E> &&
     std::is_nothrow_move_constructible_v<E> &&
-    requires(E& e, E const& ce, E const& ce2, std::coroutine_handle<> h) {
+    requires(E& e, E const& ce, E const& ce2, continuation c) {
         { ce == ce2 } noexcept -> std::convertible_to<bool>;
         { ce.context() } noexcept;
         requires std::is_lvalue_reference_v<decltype(ce.context())> &&
@@ -149,8 +176,8 @@ concept Executor =
         { ce.on_work_started() } noexcept;
         { ce.on_work_finished() } noexcept;
 
-        { ce.dispatch(h) } -> std::same_as<std::coroutine_handle<>>;
-        { ce.post(h) };
+        { ce.dispatch(c) } -> std::same_as<std::coroutine_handle<>>;
+        { ce.post(c) };
     };
 
 } // capy
diff --git a/include/boost/capy/concept/io_awaitable.hpp b/include/boost/capy/concept/io_awaitable.hpp
index 34715e270..a1942e372 100644
--- a/include/boost/capy/concept/io_awaitable.hpp
+++ b/include/boost/capy/concept/io_awaitable.hpp
@@ -13,6 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <coroutine>
 #include <boost/capy/ex/io_env.hpp>
+#include <ranges>
 
 namespace boost {
 namespace capy {
@@ -41,9 +42,9 @@ namespace capy {
     @li The awaitable should monitor `env->stop_token` and
         complete early with a cancellation error if stop is
         requested
-    @li The awaitable may use `env->allocator` for internal
+    @li The awaitable may use `env->frame_allocator` for internal
         allocations
-    @li The awaitable must propagate `env->allocator` faithfully
+    @li The awaitable must propagate `env->frame_allocator` faithfully
         to any child coroutines it creates
     @li The awaitable may return `std::noop_coroutine()` to
         indicate the operation was started asynchronously
@@ -78,16 +79,19 @@ namespace capy {
     struct my_io_op
     {
         io_env const* env_ = nullptr;
-        std::coroutine_handle<> cont_;
+        continuation cont_;
 
         auto await_suspend(
             std::coroutine_handle<> h,
             io_env const* env )
         {
             env_ = env;
-            cont_ = h;
+            cont_ = continuation{h};
             // Pass members by value; capturing this
-            // risks use-after-free in async callbacks
+            // risks use-after-free in async callbacks.
+            // When the async operation completes, resume
+            // via executor.post(cont_) or executor.dispatch(cont_)
+            // rather than calling h.resume() directly.
             start_async(
                 env_->stop_token,
                 env_->executor,
@@ -112,14 +116,27 @@ concept IoAwaitable =
         a.await_suspend(h, env);
     };
 
-namespace detail {
+/** The return type of `co_await a` for awaitable type A.
 
-/** Extract the result type from any awaitable via await_resume().
+    Given an awaitable A, yields the type returned by A::await_resume().
+
+    @tparam A The awaitable type.
 */
 template<typename A>
 using awaitable_result_t = decltype(std::declval<std::decay_t<A>&>().await_resume());
 
-} // namespace detail
+/** Concept for ranges of I/O awaitables.
+
+    A range satisfies `IoAwaitableRange` if it is a sized input range
+    whose value type satisfies @ref IoAwaitable.
+
+    @tparam R The range type.
+*/
+template<typename R>
+concept IoAwaitableRange =
+    std::ranges::input_range<R> &&
+    std::ranges::sized_range<R> &&
+    IoAwaitable<std::ranges::range_value_t<R>>;
 
 } // namespace capy
 } // namespace boost
diff --git a/include/boost/capy/concept/read_source.hpp b/include/boost/capy/concept/read_source.hpp
index af8fc4f23..d1d6c08e5 100644
--- a/include/boost/capy/concept/read_source.hpp
+++ b/include/boost/capy/concept/read_source.hpp
@@ -27,8 +27,8 @@ namespace capy {
 
     A type satisfies `ReadSource` if it satisfies @ref ReadStream
     and additionally provides a `read` member function that accepts
-    any @ref MutableBufferSequence and is an @ref IoAwaitable whose
-    return value decomposes to `(error_code, std::size_t)`.
+    any @ref MutableBufferSequence and await-returns
+    `(error_code, std::size_t)`.
 
     `ReadSource` refines `ReadStream`. Every `ReadSource` is a
     `ReadStream`. Algorithms constrained on `ReadStream` accept both
@@ -47,8 +47,8 @@ namespace capy {
 
     @par Semantic Requirements
 
-    The inherited `read_some` operation reads one or more bytes
-    (partial read). See @ref ReadStream.
+    The inherited `read_some` operation attempts to read up to
+    `buffer_size( buffers )` bytes (partial read). See @ref ReadStream.
 
     The `read` operation fills the entire buffer sequence. On return,
     exactly one of the following is true:
@@ -69,6 +69,15 @@ namespace capy {
     When the buffer sequence contains multiple buffers, each buffer is
     filled completely before proceeding to the next.
 
+    @par Error Reporting
+    I/O conditions arising from the underlying I/O system (EOF,
+    connection reset, broken pipe, etc.) are reported via the
+    `error_code` component of the return value. Failures in the
+    library wrapper itself (such as memory allocation failure)
+    are reported via exceptions.
+
+    @throws std::bad_alloc If coroutine frame allocation fails.
+
     @par Buffer Lifetime
 
     The caller must ensure that the memory referenced by the buffer
diff --git a/include/boost/capy/concept/read_stream.hpp b/include/boost/capy/concept/read_stream.hpp
index 8525e8e82..15180b872 100644
--- a/include/boost/capy/concept/read_stream.hpp
+++ b/include/boost/capy/concept/read_stream.hpp
@@ -26,7 +26,7 @@ namespace capy {
 
     A type satisfies `ReadStream` if it provides a `read_some`
     member function template that accepts any @ref MutableBufferSequence
-    and is an @ref IoAwaitable yielding `(error_code,std::size_t)`.
+    and await-returns `(error_code, std::size_t)`.
 
     @par Syntactic Requirements
     @li `T` must provide a `read_some` member function template
@@ -36,24 +36,31 @@ namespace capy {
         `(error_code,std::size_t)` via structured bindings
 
     @par Semantic Requirements
-    If `buffer_size( buffers ) > 0`, the operation reads one or more
-    bytes from the stream into the buffer sequence:
+    Attempts to read up to `buffer_size( buffers )` bytes from
+    the stream into the buffer sequence.
 
-    @li On success: `!ec`, and `n` is the number of bytes
-        read (at least 1).
-    @li On error: `ec`, and `n` is 0.
-    @li On end-of-file: `ec == cond::eof`, and `n` is 0.
+    If `buffer_size( buffers ) > 0`:
 
-    If `buffer_empty( buffers )` is `true`, the operation completes
-    immediately. `!ec`, and `n` is 0.
+    @li If `!ec`, then `n >= 1 && n <= buffer_size( buffers )`.
+        `n` bytes were read into the buffer sequence.
+    @li If `ec`, then `n >= 0 && n <= buffer_size( buffers )`.
+        `n` is the number of bytes read before the I/O
+        condition arose.
 
-    Buffers in the sequence are filled completely before proceeding
-    to the next buffer.
+    If `buffer_empty( buffers )` is `true`, `n` is 0. The empty
+    buffer is not itself a cause for error, but `ec` may reflect
+    the state of the stream.
 
-    @par Design Rationale
-    The requirement that `n` is 0 whenever `ec` is set follows
-    from a consistency constraint with the empty-buffer rule.
-    See the ReadStream design document for a complete derivation.
+    Buffers in the sequence are filled in order.
+
+    @par Error Reporting
+    I/O conditions arising from the underlying I/O system (EOF,
+    connection reset, broken pipe, etc.) are reported via the
+    `error_code` component of the return value. Failures in the
+    library wrapper itself (such as memory allocation failure)
+    are reported via exceptions.
+
+    @throws std::bad_alloc If coroutine frame allocation fails.
 
     @par Buffer Lifetime
     The caller must ensure that the memory referenced by `buffers`
@@ -61,21 +68,24 @@ namespace capy {
 
     @par Conforming Signatures
     @code
-    // Templated for any MutableBufferSequence
-    template< MutableBufferSequence MB >
-    IoAwaitable auto read_some( MB const& buffers );
-
     template< MutableBufferSequence MB >
-    IoAwaitable auto read_some( MB buffers );  // by-value also permitted
+    IoAwaitable auto read_some( MB buffers );
     @endcode
 
-    @warning **Coroutine Buffer Lifetime**: When implementing coroutine
-    member functions, prefer accepting buffer sequences **by value**
-    rather than by reference. Buffer sequences passed by reference may
-    become dangling if the caller's stack frame is destroyed before the
-    coroutine completes. Passing by value ensures the buffer sequence
-    is copied into the coroutine frame and remains valid across
-    suspension points.
+    @warning **Pass buffer sequences by value.** A by-value parameter
+    is copied into the coroutine frame (or the awaitable's state),
+    so the returned awaitable is self-contained and may be stored,
+    moved across threads, or wrapped into a sender without lifetime
+    concerns. A by-const-reference parameter binds to caller storage
+    and is only safe when the awaitable is consumed immediately by
+    `co_await` in the same scope; storing such an awaitable produces
+    a dangling reference.
+
+    @note Callers who want to avoid copying an expensive buffer
+    sequence (for example, a `std::vector<mutable_buffer>` with many
+    entries) can pass `std::views::all(seq)` at the call site. The
+    resulting `ref_view` satisfies the buffer-sequence concepts and
+    copies in O(1). See `doc/buffers-passing-rationale.md`.
 
     @par Example
     @code
@@ -87,9 +97,9 @@ namespace capy {
         {
             auto [ec, n] = co_await s.read_some(
                 mutable_buffer( buf + total, size - total ) );
+            total += n;
             if( ec )
                 co_return;
-            total += n;
         }
     }
     @endcode
diff --git a/include/boost/capy/concept/slice.hpp b/include/boost/capy/concept/slice.hpp
new file mode 100644
index 000000000..928d2e190
--- /dev/null
+++ b/include/boost/capy/concept/slice.hpp
@@ -0,0 +1,136 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_CONCEPT_SLICE_HPP
+#define BOOST_CAPY_CONCEPT_SLICE_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/buffers.hpp>
+
+#include <cstddef>
+
+namespace boost {
+namespace capy {
+
+/** Concept for types that view a byte sub-range of a buffer sequence.
+
+    A type satisfies `Slice` if it provides a view over a contiguous
+    byte range within an underlying buffer sequence, with an operation
+    to advance the start and exposure of the current bytes as a buffer
+    sequence.
+
+    @par Syntactic Requirements
+    @li `cs.data()` returns a @ref ConstBufferSequence
+    @li `s.remove_prefix(n)` advances the start of the slice by `n` bytes
+
+    @par Semantic Requirements
+    @li `s.data()` returns a buffer sequence view of the slice's current
+        live bytes.
+    @li `s.remove_prefix(n)` makes the first `min(n, total_live_bytes)`
+        bytes no longer part of the slice.
+
+    @par Lifetime
+    A `Slice` is associated, on construction, with an underlying
+    buffer sequence. The slice is valid for as long as that sequence
+    — and the memory referenced by its buffer descriptors — remains
+    valid. Operating on a slice whose underlying sequence is no
+    longer valid is undefined behavior.
+
+    The buffer sequence returned by `data()` is independent of the
+    slice object: subsequent operations on the slice
+    (`remove_prefix`, copy, move, destruction) do not invalidate
+    an already-obtained `data()` view. It remains valid for as
+    long as the slice's underlying buffer sequence is valid.
+
+    Buffer descriptors obtained through `data()` follow the same
+    invalidation rules as those of the underlying sequence.
+
+    @par Concrete Types
+    Objects modeling `Slice` are produced by the @ref buffer_slice free
+    function. The concrete type returned by `buffer_slice` is unspecified;
+    user code should bind it with `auto` and rely on this concept. When
+    the underlying buffer sequence models @ref MutableBufferSequence, the
+    returned object additionally models @ref MutableSlice.
+
+    @par Example
+    @code
+    template< WriteStream Stream, Slice S >
+    task<> write_all( Stream& stream, S s, std::size_t total )
+    {
+        std::size_t sent = 0;
+        while( sent < total )
+        {
+            auto [ec, n] = co_await stream.write_some( s.data() );
+            s.remove_prefix( n );
+            sent += n;
+            if( ec )
+                co_return;
+        }
+    }
+    @endcode
+
+    @see buffer_slice, MutableSlice, ConstBufferSequence
+*/
+template<typename T>
+concept Slice =
+    requires(T& s, T const& cs, std::size_t n)
+    {
+        { cs.data() } -> ConstBufferSequence;
+        s.remove_prefix(n);
+    };
+
+/** Concept for slices whose `data()` exposes writable buffers.
+
+    A type satisfies `MutableSlice` if it satisfies @ref Slice and
+    its `data()` member additionally returns a
+    @ref MutableBufferSequence. This is the slice analog of the
+    @ref MutableBufferSequence refinement of @ref ConstBufferSequence.
+
+    Use `MutableSlice` to constrain generic code that needs to pass
+    the slice's current bytes to a @ref ReadStream's `read_some` or
+    any other operation requiring write access through the buffer
+    sequence.
+
+    @par Producing a MutableSlice
+    @ref buffer_slice returns an object modeling `MutableSlice` when
+    the input buffer sequence models @ref MutableBufferSequence. When
+    the input is only @ref ConstBufferSequence, the returned object
+    models @ref Slice but not `MutableSlice`.
+
+    @par Example
+    @code
+    template< ReadStream Stream, MutableSlice S >
+    task<> read_all( Stream& stream, S s, std::size_t total )
+    {
+        std::size_t received = 0;
+        while( received < total )
+        {
+            auto [ec, n] = co_await stream.read_some( s.data() );
+            s.remove_prefix( n );
+            received += n;
+            if( ec )
+                co_return;
+        }
+    }
+    @endcode
+
+    @see Slice, buffer_slice, MutableBufferSequence
+*/
+template<typename T>
+concept MutableSlice =
+    Slice<T> &&
+    requires(T const& cs)
+    {
+        { cs.data() } -> MutableBufferSequence;
+    };
+
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/concept/write_sink.hpp b/include/boost/capy/concept/write_sink.hpp
index 82c0566bb..0324a3601 100644
--- a/include/boost/capy/concept/write_sink.hpp
+++ b/include/boost/capy/concept/write_sink.hpp
@@ -27,7 +27,8 @@ namespace capy {
 
     A type satisfies `WriteSink` if it satisfies @ref WriteStream
     and additionally provides `write`, `write_eof(buffers)`, and
-    `write_eof()` member functions that are @ref IoAwaitable.
+    `write_eof()` member functions that await-return
+    `(error_code, std::size_t)`.
 
     `WriteSink` refines `WriteStream`. Every `WriteSink` is a
     `WriteStream`. Algorithms constrained on `WriteStream` accept
@@ -51,8 +52,8 @@ namespace capy {
 
     @par Semantic Requirements
 
-    The inherited `write_some` operation writes one or more bytes
-    (partial write). See @ref WriteStream.
+    The inherited `write_some` operation attempts to write up to
+    `buffer_size( buffers )` bytes (partial write). See @ref WriteStream.
 
     The `write` operation consumes the entire buffer sequence:
 
@@ -76,6 +77,15 @@ namespace capy {
     After `write_eof` (either overload) returns successfully, no
     further writes or EOF signals are permitted.
 
+    @par Error Reporting
+    I/O conditions arising from the underlying I/O system (EOF,
+    connection reset, broken pipe, etc.) are reported via the
+    `error_code` component of the return value. Failures in the
+    library wrapper itself (such as memory allocation failure)
+    are reported via exceptions.
+
+    @throws std::bad_alloc If coroutine frame allocation fails.
+
     @par Buffer Lifetime
 
     The caller must ensure that the memory referenced by the buffer
diff --git a/include/boost/capy/concept/write_stream.hpp b/include/boost/capy/concept/write_stream.hpp
index 83688989d..c4bd725d2 100644
--- a/include/boost/capy/concept/write_stream.hpp
+++ b/include/boost/capy/concept/write_stream.hpp
@@ -26,8 +26,7 @@ namespace capy {
 
     A type satisfies `WriteStream` if it provides a `write_some`
     member function template that accepts any @ref ConstBufferSequence
-    and is an @ref IoAwaitable whose return value decomposes to
-    `(error_code,std::size_t)`.
+    and await-returns `(error_code, std::size_t)`.
 
     @tparam T The stream type.
 
@@ -41,18 +40,32 @@ namespace capy {
 
     @par Semantic Requirements
 
-    If `buffer_size( buffers ) > 0`, the operation writes one or more
-    bytes of data to the stream from the buffer sequence:
+    Attempts to write up to `buffer_size( buffers )` bytes from
+    the buffer sequence to the stream.
 
-    @li On success: `!ec`, and `n` is the number of bytes
-        written.
-    @li On error: `ec`, and `n` is 0.
+    If `buffer_size( buffers ) > 0`:
 
-    If `buffer_empty( buffers )` is `true`, the operation completes
-    immediately. `!ec`, and `n` is 0.
+    @li If `!ec`, then `n >= 1 && n <= buffer_size( buffers )`.
+        `n` bytes were written from the buffer sequence.
+    @li If `ec`, then `n >= 0 && n <= buffer_size( buffers )`.
+        `n` is the number of bytes written before the I/O
+        condition arose.
 
-    Buffers in the sequence are written completely before proceeding
-    to the next buffer.
+    If `buffer_empty( buffers )` is `true`, `n` is 0. The empty
+    buffer is not itself a cause for error, but `ec` may reflect
+    the state of the stream.
+
+    Buffers in the sequence are consumed in order.
+
+    @par Error Reporting
+
+    I/O conditions arising from the underlying I/O system (EOF,
+    connection reset, broken pipe, etc.) are reported via the
+    `error_code` component of the return value. Failures in the
+    library wrapper itself (such as memory allocation failure)
+    are reported via exceptions.
+
+    @throws std::bad_alloc If coroutine frame allocation fails.
 
     @par Buffer Lifetime
 
@@ -66,13 +79,20 @@ namespace capy {
     IoAwaitable auto write_some( Buffers buffers );
     @endcode
 
-    @warning **Coroutine Buffer Lifetime**: When implementing coroutine
-    member functions, prefer accepting buffer sequences **by value**
-    rather than by reference. Buffer sequences passed by reference may
-    become dangling if the caller's stack frame is destroyed before the
-    coroutine completes. Passing by value ensures the buffer sequence
-    is copied into the coroutine frame and remains valid across
-    suspension points.
+    @warning **Pass buffer sequences by value.** A by-value parameter
+    is copied into the coroutine frame (or the awaitable's state),
+    so the returned awaitable is self-contained and may be stored,
+    moved across threads, or wrapped into a sender without lifetime
+    concerns. A by-const-reference parameter binds to caller storage
+    and is only safe when the awaitable is consumed immediately by
+    `co_await` in the same scope; storing such an awaitable produces
+    a dangling reference.
+
+    @note Callers who want to avoid copying an expensive buffer
+    sequence (for example, a `std::vector<const_buffer>` with many
+    entries) can pass `std::views::all(seq)` at the call site. The
+    resulting `ref_view` satisfies the buffer-sequence concepts and
+    copies in O(1). See `doc/buffers-passing-rationale.md`.
 
     @par Example
 
@@ -85,9 +105,9 @@ namespace capy {
         {
             auto [ec, n] = co_await s.write_some(
                 const_buffer( buf + total, size - total ) );
+            total += n;
             if( ec )
                 co_return;
-            total += n;
         }
     }
     @endcode
diff --git a/include/boost/capy/cond.hpp b/include/boost/capy/cond.hpp
index b15042fdb..fd369af24 100644
--- a/include/boost/capy/cond.hpp
+++ b/include/boost/capy/cond.hpp
@@ -69,10 +69,15 @@ enum class cond
         An `error_code` compares equal to `not_found` when a
         lookup operation failed to find the requested item.
     */
-    not_found = 4
-};
+    not_found = 4,
+
+    /** Operation timed out condition.
 
-//-----------------------------------------------
+        An `error_code` compares equal to `timeout` when an
+        operation exceeded its allowed duration.
+    */
+    timeout = 5
+};
 
 } // capy
 } // boost
@@ -87,8 +92,6 @@ struct is_error_condition_enum<
 namespace boost {
 namespace capy {
 
-//-----------------------------------------------
-
 namespace detail {
 
 struct BOOST_CAPY_SYMBOL_VISIBLE
@@ -109,8 +112,7 @@ BOOST_CAPY_DECL extern cond_cat_type cond_cat;
 
 } // detail
 
-//-----------------------------------------------
-
+/// Create an error_condition from a cond value.
 inline
 std::error_condition
 make_error_condition(
diff --git a/include/boost/capy/continuation.hpp b/include/boost/capy/continuation.hpp
new file mode 100644
index 000000000..765d05853
--- /dev/null
+++ b/include/boost/capy/continuation.hpp
@@ -0,0 +1,78 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_CONTINUATION_HPP
+#define BOOST_CAPY_CONTINUATION_HPP
+
+#include <boost/capy/detail/config.hpp>
+
+#include <coroutine>
+
+namespace boost {
+namespace capy {
+
+/** Executor-facing schedulable unit.
+
+    Wraps a `std::coroutine_handle<>` with an intrusive list
+    pointer so executors can queue continuations without
+    per-post heap allocation.
+
+    @par Fields
+
+    @li `h` — the coroutine handle to resume. Set by the
+        code that creates or reuses the continuation (typically
+        an I/O awaitable or combinator). Read by the executor
+        when it dequeues the continuation.
+
+    @li `next` — intrusive linked-list pointer, owned and
+        managed exclusively by executor implementations. Users
+        must not read or write `next` while the continuation
+        is enqueued.
+
+    @par Ownership and Lifetime
+
+    The continuation is owned by the site that embeds it (an
+    I/O awaitable, combinator state, or trampoline promise).
+    The executor borrows it by reference for the duration of
+    the queue residency.
+
+    A continuation must have a **stable address** while it is
+    linked into an executor's queue. It must not be moved,
+    destroyed, or enqueued in more than one queue concurrently.
+
+    @par Copy and Move
+
+    Trivially copyable and movable (aggregate of a handle and
+    a pointer). However, copying or moving a queued
+    continuation produces a second object whose `next` is
+    stale — the executor still points to the original. Copy
+    and move are safe only when the continuation is not
+    enqueued.
+
+    @par Thread Safety
+
+    A single continuation must not be accessed concurrently
+    without external synchronization. In practice, the
+    creating thread sets `h` and calls `executor.post(c)`;
+    the executor's worker thread later reads `h` and calls
+    `h.resume()`. The executor's internal locking provides
+    the necessary synchronization between these two accesses.
+
+    @see Executor, executor_ref
+*/
+struct continuation
+{
+    std::coroutine_handle<> h;
+    continuation* next = nullptr;
+};
+
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/delay.hpp b/include/boost/capy/delay.hpp
new file mode 100644
index 000000000..279fd9f44
--- /dev/null
+++ b/include/boost/capy/delay.hpp
@@ -0,0 +1,233 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_DELAY_HPP
+#define BOOST_CAPY_DELAY_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/error.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/ex/detail/timer_service.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <atomic>
+#include <chrono>
+#include <coroutine>
+#include <new>
+#include <stop_token>
+#include <utility>
+
+namespace boost {
+namespace capy {
+
+/** IoAwaitable returned by @ref delay.
+
+    Suspends the calling coroutine until the deadline elapses
+    or the environment's stop token is activated, whichever
+    comes first. Resumption is always posted through the
+    executor, never inline on the timer thread.
+
+    Not intended to be named directly; use the @ref delay
+    factory function instead.
+
+    @par Return Value
+
+    Returns `io_result<>{}` (no error) when the timer fires
+    normally, or `io_result<>{error::canceled}` when
+    cancellation claims the resume before the deadline.
+
+    @par Cancellation
+
+    If `stop_requested()` is true before suspension, the
+    coroutine resumes immediately without scheduling a timer
+    and returns `io_result<>{error::canceled}`. If stop is
+    requested while suspended, the stop callback claims the
+    resume and posts it through the executor; the pending
+    timer is cancelled on the next `await_resume` or
+    destructor call.
+
+    @par Thread Safety
+
+    A single `delay_awaitable` must not be awaited concurrently.
+    Multiple independent `delay()` calls on the same
+    execution_context are safe and share one timer thread.
+
+    @see delay, timeout
+*/
+class delay_awaitable
+{
+    std::chrono::nanoseconds dur_;
+
+    detail::timer_service* ts_ = nullptr;
+    detail::timer_service::timer_id tid_ = 0;
+
+    // Declared before stop_cb_buf_: the callback
+    // accesses these members, so they must still be
+    // alive if the stop_cb_ destructor blocks.
+    continuation cont_;
+    std::atomic<bool> claimed_{false};
+    bool canceled_ = false;
+    bool stop_cb_active_ = false;
+
+    struct cancel_fn
+    {
+        delay_awaitable* self_;
+        executor_ref ex_;
+
+        void operator()() const noexcept
+        {
+            if(!self_->claimed_.exchange(
+                true, std::memory_order_acq_rel))
+            {
+                self_->canceled_ = true;
+                ex_.post(self_->cont_);
+            }
+        }
+    };
+
+    using stop_cb_t = std::stop_callback<cancel_fn>;
+
+    // Aligned storage for the stop callback.
+    // Declared last: its destructor may block while
+    // the callback accesses the members above.
+    BOOST_CAPY_MSVC_WARNING_PUSH
+    BOOST_CAPY_MSVC_WARNING_DISABLE(4324)
+    alignas(stop_cb_t)
+        unsigned char stop_cb_buf_[sizeof(stop_cb_t)];
+    BOOST_CAPY_MSVC_WARNING_POP
+
+    stop_cb_t& stop_cb_() noexcept
+    {
+        return *reinterpret_cast<stop_cb_t*>(stop_cb_buf_);
+    }
+
+public:
+    explicit delay_awaitable(std::chrono::nanoseconds dur) noexcept
+        : dur_(dur)
+    {
+    }
+
+    /// @pre The stop callback must not be active
+    ///      (i.e. the object has not yet been awaited).
+    delay_awaitable(delay_awaitable&& o) noexcept
+        : dur_(o.dur_)
+        , ts_(o.ts_)
+        , tid_(o.tid_)
+        , cont_(o.cont_)
+        , claimed_(o.claimed_.load(std::memory_order_relaxed))
+        , canceled_(o.canceled_)
+        , stop_cb_active_(std::exchange(o.stop_cb_active_, false))
+    {
+    }
+
+    ~delay_awaitable()
+    {
+        if(stop_cb_active_)
+            stop_cb_().~stop_cb_t();
+        if(ts_)
+            ts_->cancel(tid_);
+    }
+
+    delay_awaitable(delay_awaitable const&) = delete;
+    delay_awaitable& operator=(delay_awaitable const&) = delete;
+    delay_awaitable& operator=(delay_awaitable&&) = delete;
+
+    bool await_ready() const noexcept
+    {
+        return dur_.count() <= 0;
+    }
+
+    std::coroutine_handle<>
+    await_suspend(
+        std::coroutine_handle<> h,
+        io_env const* env) noexcept
+    {
+        // Already stopped: resume immediately
+        if(env->stop_token.stop_requested())
+        {
+            canceled_ = true;
+            return h;
+        }
+
+        cont_.h = h;
+        ts_ = &env->executor.context().use_service<detail::timer_service>();
+
+        // Schedule timer (won't fire inline since deadline is in the future)
+        tid_ = ts_->schedule_after(dur_,
+            [this, ex = env->executor]()
+            {
+                if(!claimed_.exchange(
+                    true, std::memory_order_acq_rel))
+                {
+                    ex.post(cont_);
+                }
+            });
+
+        // Register stop callback (may fire inline)
+        ::new(stop_cb_buf_) stop_cb_t(
+            env->stop_token,
+            cancel_fn{this, env->executor});
+        stop_cb_active_ = true;
+
+        return std::noop_coroutine();
+    }
+
+    io_result<> await_resume() noexcept
+    {
+        if(stop_cb_active_)
+        {
+            stop_cb_().~stop_cb_t();
+            stop_cb_active_ = false;
+        }
+        if(ts_)
+            ts_->cancel(tid_);
+        if(canceled_)
+            return io_result<>{make_error_code(error::canceled)};
+        return io_result<>{};
+    }
+};
+
+/** Suspend the current coroutine for a duration.
+
+    Returns an IoAwaitable that completes at or after the
+    specified duration, or earlier if the environment's stop
+    token is activated.
+
+    Zero or negative durations complete synchronously without
+    scheduling a timer.
+
+    @par Example
+    @code
+    auto [ec] = co_await delay(std::chrono::milliseconds(100));
+    @endcode
+
+    @param dur The duration to wait.
+
+    @return A @ref delay_awaitable whose `await_resume`
+        returns `io_result<>`. On normal completion, `ec`
+        is clear. On cancellation, `ec == error::canceled`.
+
+    @throws Nothing.
+
+    @see timeout, delay_awaitable
+*/
+template<typename Rep, typename Period>
+delay_awaitable
+delay(std::chrono::duration<Rep, Period> dur) noexcept
+{
+    return delay_awaitable{
+        std::chrono::duration_cast<std::chrono::nanoseconds>(dur)};
+}
+
+} // capy
+} // boost
+
+#endif
diff --git a/include/boost/capy/detail/await_suspend_helper.hpp b/include/boost/capy/detail/await_suspend_helper.hpp
index 5115900d8..c2d704ccb 100644
--- a/include/boost/capy/detail/await_suspend_helper.hpp
+++ b/include/boost/capy/detail/await_suspend_helper.hpp
@@ -1,5 +1,6 @@
 //
 // Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
+// Copyright (c) 2026 Steve Gerbino
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -19,6 +20,54 @@ namespace boost {
 namespace capy {
 namespace detail {
 
+/** Perform symmetric transfer, working around an MSVC codegen bug.
+
+    MSVC stores the `std::coroutine_handle<>` returned from
+    `await_suspend` in a hidden `__$ReturnUdt$` variable located
+    on the coroutine frame. When another thread resumes or destroys
+    the frame between the store and the read-back for the
+    symmetric-transfer tail-call, the read hits freed memory.
+
+    This occurs in two scenarios:
+
+    @li `await_suspend` calls `h.destroy()` then returns a handle
+        (e.g. `when_all_runner` and `when_any_runner` final_suspend).
+        The return value is written to the now-destroyed frame.
+
+    @li `await_suspend` hands the continuation to another thread
+        via an executor handoff (e.g. `post()` or `dispatch()`),
+        which may resume the parent. The parent can destroy this
+        frame before the runtime reads `__$ReturnUdt$` (e.g.
+        `boundary_trampoline` final_suspend).
+
+    On MSVC this function calls `h.resume()` on the current stack
+    and returns `void`, causing unconditional suspension. The
+    trade-off is O(n) stack growth instead of O(1) tail-calls.
+
+    On other compilers the handle is returned directly for proper
+    symmetric transfer.
+
+    Callers must use `auto` return type on their `await_suspend`
+    so the return type adapts per platform.
+
+    @param h The coroutine handle to transfer to.
+*/
+#if BOOST_CAPY_WORKAROUND(_MSC_VER, >= 1)
+inline void symmetric_transfer(std::coroutine_handle<> h) noexcept
+{
+    // safe_resume is not needed here: the calling coroutine is
+    // about to suspend unconditionally. When it later resumes,
+    // await_resume restores TLS from the promise's environment.
+    h.resume();
+}
+#else
+inline std::coroutine_handle<>
+symmetric_transfer(std::coroutine_handle<> h) noexcept
+{
+    return h;
+}
+#endif
+
 // Helper to normalize await_suspend return types to std::coroutine_handle<>
 template<typename Awaitable>
 std::coroutine_handle<> call_await_suspend(
diff --git a/include/boost/capy/buffers/buffer_array.hpp b/include/boost/capy/detail/buffer_array.hpp
similarity index 80%
rename from include/boost/capy/buffers/buffer_array.hpp
rename to include/boost/capy/detail/buffer_array.hpp
index 731c8b2db..2d0155e14 100644
--- a/include/boost/capy/buffers/buffer_array.hpp
+++ b/include/boost/capy/detail/buffer_array.hpp
@@ -7,8 +7,8 @@
 // Official repository: https://github.com/cppalliance/capy
 //
 
-#ifndef BOOST_CAPY_BUFFERS_BUFFER_ARRAY_HPP
-#define BOOST_CAPY_BUFFERS_BUFFER_ARRAY_HPP
+#ifndef BOOST_CAPY_DETAIL_BUFFER_ARRAY_HPP
+#define BOOST_CAPY_DETAIL_BUFFER_ARRAY_HPP
 
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/except.hpp>
@@ -21,43 +21,8 @@
 
 namespace boost {
 namespace capy {
-
 namespace detail {
 
-BOOST_CAPY_DECL
-void
-buffer_array_remove_prefix(
-    const_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept;
-
-BOOST_CAPY_DECL
-void
-buffer_array_remove_prefix(
-    mutable_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept;
-
-BOOST_CAPY_DECL
-void
-buffer_array_keep_prefix(
-    const_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept;
-
-BOOST_CAPY_DECL
-void
-buffer_array_keep_prefix(
-    mutable_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept;
-
-} // namespace detail
-
 /** A buffer sequence holding up to N buffers.
 
     This class template stores a fixed-capacity array of buffer
@@ -65,18 +30,17 @@ buffer_array_keep_prefix(
     It provides efficient storage for small buffer sequences
     without dynamic allocation.
 
-    @tparam N Maximum number of buffers the array can hold.
-    @tparam IsConst If true, holds const_buffer; otherwise mutable_buffer.
-
-    @par Usage
-
+    @par Example
     @code
     void process(ConstBufferSequence auto const& buffers)
     {
-        const_buffer_array<4> bufs(buffers);
+        detail::const_buffer_array<4> bufs(buffers);
         // use bufs.begin(), bufs.end(), bufs.to_span()
     }
     @endcode
+
+    @tparam N Maximum number of buffers the array can hold.
+    @tparam IsConst If true, holds const_buffer; otherwise mutable_buffer.
 */
 template<std::size_t N, bool IsConst>
 class buffer_array
@@ -95,7 +59,7 @@ class buffer_array
     };
 
 public:
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty buffer array.
     */
@@ -104,7 +68,7 @@ class buffer_array
     {
     }
 
-    /** Copy constructor.
+    /** Construct a copy.
     */
     buffer_array(buffer_array const& other) noexcept
         : n_(other.n_)
@@ -252,7 +216,7 @@ class buffer_array
             arr_[n_].~value_type();
     }
 
-    /** Copy assignment.
+    /** Assign by copying.
     */
     buffer_array&
     operator=(buffer_array const& other) noexcept
@@ -333,61 +297,13 @@ class buffer_array
 
     /** Return the total byte count in O(1).
     */
-    friend
     std::size_t
-    tag_invoke(
-        size_tag const&,
-        buffer_array const& ba) noexcept
-    {
-        return ba.size_;
-    }
-
-    /** Slice customization point.
-    */
-    friend
-    void
-    tag_invoke(
-        slice_tag const&,
-        buffer_array& ba,
-        slice_how how,
-        std::size_t n) noexcept
+    byte_size() const noexcept
     {
-        ba.slice_impl(how, n);
-    }
-
-private:
-    void
-    slice_impl(
-        slice_how how,
-        std::size_t n) noexcept
-    {
-        switch(how)
-        {
-        case slice_how::remove_prefix:
-            remove_prefix_impl(n);
-            break;
-
-        case slice_how::keep_prefix:
-            keep_prefix_impl(n);
-            break;
-        }
-    }
-
-    void
-    remove_prefix_impl(std::size_t n) noexcept
-    {
-        detail::buffer_array_remove_prefix(arr_, &n_, &size_, n);
-    }
-
-    void
-    keep_prefix_impl(std::size_t n) noexcept
-    {
-        detail::buffer_array_keep_prefix(arr_, &n_, &size_, n);
+        return size_;
     }
 };
 
-//------------------------------------------------
-
 /** Alias for buffer_array holding const_buffer.
 
     @tparam N Maximum number of buffers.
@@ -402,6 +318,7 @@ using const_buffer_array = buffer_array<N, true>;
 template<std::size_t N>
 using mutable_buffer_array = buffer_array<N, false>;
 
+} // namespace detail
 } // namespace capy
 } // namespace boost
 
diff --git a/include/boost/capy/detail/config.hpp b/include/boost/capy/detail/config.hpp
index 9c22db96d..6ea93575c 100644
--- a/include/boost/capy/detail/config.hpp
+++ b/include/boost/capy/detail/config.hpp
@@ -15,6 +15,58 @@
 # define BOOST_CAPY_ASSERT(expr) assert(expr)
 #endif
 
+//------------------------------------------------
+//
+// Compiler bug workarounds
+//
+//------------------------------------------------
+
+/* Standalone workaround macro modeled on Boost.Config's BOOST_WORKAROUND.
+
+   Guard mechanism: when a compiler symbol is not defined, the
+   corresponding _WORKAROUND_GUARD macro is 1, which makes
+   BOOST_CAPY_WORKAROUND evaluate to 0 on that compiler.
+
+   Usage:
+     #if BOOST_CAPY_WORKAROUND(_MSC_VER, >= 1)      // any MSVC
+     #if BOOST_CAPY_WORKAROUND(_MSC_VER, <= 1900)    // MSVC 14.0 and earlier
+     #if BOOST_CAPY_WORKAROUND(__GNUC__, < 12)        // GCC before 12
+*/
+
+#ifndef _MSC_VER
+# define _MSC_VER_WORKAROUND_GUARD 1
+#else
+# define _MSC_VER_WORKAROUND_GUARD 0
+#endif
+
+#ifndef __GNUC__
+# define __GNUC___WORKAROUND_GUARD 1
+#else
+# define __GNUC___WORKAROUND_GUARD 0
+#endif
+
+#ifndef __clang_major__
+# define __clang_major___WORKAROUND_GUARD 1
+#else
+# define __clang_major___WORKAROUND_GUARD 0
+#endif
+
+#define BOOST_CAPY_WORKAROUND(symbol, test)        \
+    ((symbol ## _WORKAROUND_GUARD + 0 == 0) &&     \
+     (symbol != 0) && (1 % (( (symbol test) ) + 1)))
+
+// MSVC warning suppression helpers.
+// On MSVC these expand to __pragma(); elsewhere they are empty.
+#ifdef _MSC_VER
+# define BOOST_CAPY_MSVC_WARNING_PUSH       __pragma(warning(push))
+# define BOOST_CAPY_MSVC_WARNING_DISABLE(x) __pragma(warning(disable: x))
+# define BOOST_CAPY_MSVC_WARNING_POP        __pragma(warning(pop))
+#else
+# define BOOST_CAPY_MSVC_WARNING_PUSH
+# define BOOST_CAPY_MSVC_WARNING_DISABLE(x)
+# define BOOST_CAPY_MSVC_WARNING_POP
+#endif
+
 // Efficient thread-local storage keyword for POD types
 #if !defined(BOOST_CAPY_TLS_KEYWORD)
 # if defined(_MSC_VER)
@@ -49,13 +101,17 @@
 #endif
 
 // RTTI detection (user may predefine BOOST_CAPY_NO_RTTI)
+//
+// _MSC_VER must be checked before __clang__ because Clang-CL defines
+// both __clang__ and _MSC_VER, but uses the MSVC-style _CPPRTTI macro
+// (not the GCC-style __GXX_RTTI) to signal RTTI availability.
 #ifndef BOOST_CAPY_NO_RTTI
-# if defined(__GNUC__) || defined(__clang__)
-#  ifndef __GXX_RTTI
+# if defined(_MSC_VER)
+#  ifndef _CPPRTTI
 #   define BOOST_CAPY_NO_RTTI 1
 #  endif
-# elif defined(_MSC_VER)
-#  ifndef _CPPRTTI
+# elif defined(__GNUC__) || defined(__clang__)
+#  ifndef __GXX_RTTI
 #   define BOOST_CAPY_NO_RTTI 1
 #  endif
 # endif
@@ -76,13 +132,20 @@
 # define BOOST_CAPY_DECL
 #endif
 
-// Clang 20+ supports coro_await_elidable for heap elision
-#if defined(__clang__) && !defined(__apple_build_version__) && __clang_major__ >= 20
+// Heap elision: compiler may allocate elided coroutine frames on the caller's frame
+#if __has_cpp_attribute(clang::coro_await_elidable)
 #define BOOST_CAPY_CORO_AWAIT_ELIDABLE [[clang::coro_await_elidable]]
 #else
 #define BOOST_CAPY_CORO_AWAIT_ELIDABLE
 #endif
 
+// Simpler destroy codegen for coroutines that always run to completion
+#if __has_cpp_attribute(clang::coro_only_destroy_when_complete)
+#define BOOST_CAPY_CORO_DESTROY_WHEN_COMPLETE [[clang::coro_only_destroy_when_complete]]
+#else
+#define BOOST_CAPY_CORO_DESTROY_WHEN_COMPLETE
+#endif
+
 namespace boost::capy::detail {
 inline constexpr unsigned max_iovec_ = 16;
 }
diff --git a/include/boost/capy/detail/io_result_combinators.hpp b/include/boost/capy/detail/io_result_combinators.hpp
new file mode 100644
index 000000000..fb7dc3aa8
--- /dev/null
+++ b/include/boost/capy/detail/io_result_combinators.hpp
@@ -0,0 +1,141 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_DETAIL_IO_RESULT_COMBINATORS_HPP
+#define BOOST_CAPY_DETAIL_IO_RESULT_COMBINATORS_HPP
+
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/io_result.hpp>
+
+#include <system_error>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace boost {
+namespace capy {
+namespace detail {
+
+template<typename T>
+struct is_io_result : std::false_type {};
+
+template<typename... Args>
+struct is_io_result<io_result<Args...>> : std::true_type {};
+
+template<typename T>
+inline constexpr bool is_io_result_v = is_io_result<T>::value;
+
+/// True when every awaitable in the pack returns an io_result.
+template<typename... As>
+concept all_io_result_awaitables =
+    (is_io_result_v<awaitable_result_t<As>> && ...);
+
+/// True when the io_result-aware when_all overload should be used.
+template<typename... As>
+concept when_all_io_eligible =
+    (sizeof...(As) > 0)
+    && all_io_result_awaitables<As...>;
+
+/// True when the io_result-aware when_any overload should be used.
+template<typename... As>
+concept when_any_io_eligible =
+    (sizeof...(As) > 0)
+    && all_io_result_awaitables<As...>;
+
+/// Map an io_result specialization to its contributed payload type.
+///
+///   io_result<T>       -> T            (unwrap single)
+///   io_result<Ts...>   -> tuple<Ts...> (zero, two, or more)
+template<typename IoResult>
+struct io_result_payload;
+
+template<typename T>
+struct io_result_payload<io_result<T>>
+{
+    using type = T;
+};
+
+template<typename... Ts>
+struct io_result_payload<io_result<Ts...>>
+{
+    using type = std::tuple<Ts...>;
+};
+
+template<typename IoResult>
+using io_result_payload_t =
+    typename io_result_payload<IoResult>::type;
+
+/// Extract the payload value(s) from an io_result,
+/// matching the type produced by io_result_payload_t.
+template<typename T>
+T
+extract_io_payload(io_result<T>&& r)
+{
+    return std::get<0>(std::move(r.values));
+}
+
+template<typename... Ts>
+std::tuple<Ts...>
+extract_io_payload(io_result<Ts...>&& r)
+{
+    return std::move(r.values);
+}
+
+/// Reconstruct a success io_result from a payload extracted by when_any.
+template<typename IoResult>
+struct io_result_from_payload;
+
+template<typename T>
+struct io_result_from_payload<io_result<T>>
+{
+    static io_result<T> apply(T t)
+    {
+        return io_result<T>{{}, std::move(t)};
+    }
+};
+
+template<typename... Ts>
+struct io_result_from_payload<io_result<Ts...>>
+{
+    static io_result<Ts...> apply(std::tuple<Ts...> t)
+    {
+        return std::apply([](auto&&... args) {
+            return io_result<Ts...>{{}, std::move(args)...};
+        }, std::move(t));
+    }
+};
+
+/// Build the outer io_result for when_all from a tuple of child io_results.
+template<typename ResultType, typename Tuple, std::size_t... Is>
+ResultType
+build_when_all_io_result_impl(Tuple&& results, std::index_sequence<Is...>)
+{
+    std::error_code ec;
+    (void)((std::get<Is>(results).ec && !ec
+        ? (ec = std::get<Is>(results).ec, true)
+        : false) || ...);
+
+    return ResultType{ec, extract_io_payload(
+        std::move(std::get<Is>(results)))...};
+}
+
+template<typename ResultType, typename... IoResults>
+ResultType
+build_when_all_io_result(std::tuple<IoResults...>&& results)
+{
+    return build_when_all_io_result_impl<ResultType>(
+        std::move(results),
+        std::index_sequence_for<IoResults...>{});
+}
+
+} // namespace detail
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/detail/slice_impl.hpp b/include/boost/capy/detail/slice_impl.hpp
new file mode 100644
index 000000000..86d77c943
--- /dev/null
+++ b/include/boost/capy/detail/slice_impl.hpp
@@ -0,0 +1,305 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+/*
+    Implementation type for the public buffer_slice() free function.
+    Users see this only via auto + the Slice concept; the type is
+    documented as unspecified. Maintained alongside Slice in
+    include/boost/capy/concept/slice.hpp.
+*/
+
+#ifndef BOOST_CAPY_DETAIL_SLICE_IMPL_HPP
+#define BOOST_CAPY_DETAIL_SLICE_IMPL_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/buffers.hpp>
+
+#include <cstddef>
+#include <iterator>
+#include <type_traits>
+
+namespace boost {
+namespace capy {
+namespace detail {
+
+template<class T>
+struct slice_buffer_type_for;
+
+template<MutableBufferSequence T>
+struct slice_buffer_type_for<T>
+{
+    using type = mutable_buffer;
+};
+
+template<ConstBufferSequence T>
+    requires (!MutableBufferSequence<T>)
+struct slice_buffer_type_for<T>
+{
+    using type = const_buffer;
+};
+
+template<class BufferSequence>
+    requires MutableBufferSequence<BufferSequence>
+          || ConstBufferSequence<BufferSequence>
+class slice_impl
+{
+public:
+    using iterator_type =
+        decltype(capy::begin(std::declval<BufferSequence const&>()));
+    using end_iterator_type =
+        decltype(capy::end(std::declval<BufferSequence const&>()));
+    using buffer_type =
+        typename slice_buffer_type_for<BufferSequence>::type;
+
+private:
+    iterator_type first_{};
+    end_iterator_type last_{};
+    std::size_t front_skip_ = 0;
+    std::size_t back_skip_ = 0;
+
+    static buffer_type adjust_buffer(
+        buffer_type const& buf,
+        std::size_t front_n,
+        std::size_t back_n) noexcept
+    {
+        if constexpr (std::is_same_v<buffer_type, mutable_buffer>)
+        {
+            return mutable_buffer(
+                static_cast<char*>(buf.data()) + front_n,
+                buf.size() - front_n - back_n);
+        }
+        else
+        {
+            return const_buffer(
+                static_cast<char const*>(buf.data()) + front_n,
+                buf.size() - front_n - back_n);
+        }
+    }
+
+public:
+    /// View returned by `slice_impl::data()`.
+    class data_view
+    {
+        iterator_type first_{};
+        end_iterator_type last_{};
+        std::size_t front_skip_ = 0;
+        std::size_t back_skip_ = 0;
+
+    public:
+        class const_iterator
+        {
+            iterator_type cur_{};
+            iterator_type anchor_first_{};
+            end_iterator_type anchor_last_{};
+            std::size_t front_skip_ = 0;
+            std::size_t back_skip_ = 0;
+
+        public:
+            using iterator_category = std::bidirectional_iterator_tag;
+            using value_type = buffer_type;
+            using difference_type = std::ptrdiff_t;
+            using pointer = value_type*;
+            using reference = value_type;
+
+            const_iterator() noexcept = default;
+
+            const_iterator(
+                iterator_type cur,
+                iterator_type anchor_first,
+                end_iterator_type anchor_last,
+                std::size_t front_skip,
+                std::size_t back_skip) noexcept
+                : cur_(cur)
+                , anchor_first_(anchor_first)
+                , anchor_last_(anchor_last)
+                , front_skip_(front_skip)
+                , back_skip_(back_skip)
+            {
+            }
+
+            bool operator==(const_iterator const& other) const noexcept
+            {
+                return cur_ == other.cur_;
+            }
+
+            bool operator!=(const_iterator const& other) const noexcept
+            {
+                return !(*this == other);
+            }
+
+            value_type operator*() const noexcept
+            {
+                buffer_type buf = *cur_;
+                auto front_n = (cur_ == anchor_first_) ? front_skip_ : 0;
+                auto next = cur_;
+                ++next;
+                auto back_n = (next == anchor_last_) ? back_skip_ : 0;
+                return adjust_buffer(buf, front_n, back_n);
+            }
+
+            const_iterator& operator++() noexcept
+            {
+                ++cur_;
+                return *this;
+            }
+
+            const_iterator operator++(int) noexcept
+            {
+                const_iterator tmp = *this;
+                ++*this;
+                return tmp;
+            }
+
+            const_iterator& operator--() noexcept
+            {
+                --cur_;
+                return *this;
+            }
+
+            const_iterator operator--(int) noexcept
+            {
+                const_iterator tmp = *this;
+                --*this;
+                return tmp;
+            }
+        };
+
+        data_view() noexcept = default;
+
+        data_view(
+            iterator_type first,
+            end_iterator_type last,
+            std::size_t front_skip,
+            std::size_t back_skip) noexcept
+            : first_(first)
+            , last_(last)
+            , front_skip_(front_skip)
+            , back_skip_(back_skip)
+        {
+        }
+
+        const_iterator begin() const noexcept
+        {
+            return const_iterator(
+                first_, first_, last_, front_skip_, back_skip_);
+        }
+
+        const_iterator end() const noexcept
+        {
+            return const_iterator(
+                last_, first_, last_, front_skip_, back_skip_);
+        }
+    };
+
+    slice_impl() noexcept = default;
+
+    explicit slice_impl(BufferSequence const& bs) noexcept
+        : first_(capy::begin(bs))
+        , last_(capy::end(bs))
+    {
+    }
+
+    slice_impl(
+        BufferSequence const& bs,
+        std::size_t offset,
+        std::size_t length) noexcept
+    {
+        auto it_begin = capy::begin(bs);
+        auto it_end = capy::end(bs);
+
+        std::size_t total = 0;
+        for (auto it = it_begin; it != it_end; ++it)
+            total += (*it).size();
+
+        if (offset > total)
+            offset = total;
+        std::size_t const remaining = total - offset;
+        if (length > remaining)
+            length = remaining;
+
+        first_ = it_begin;
+        last_ = it_end;
+
+        std::size_t skip = offset;
+        while (first_ != last_)
+        {
+            std::size_t const buf_size = (*first_).size();
+            if (skip < buf_size)
+            {
+                front_skip_ = skip;
+                break;
+            }
+            skip -= buf_size;
+            ++first_;
+        }
+
+        std::size_t left = length;
+        auto cursor = first_;
+        std::size_t cursor_front = front_skip_;
+        while (cursor != last_ && left > 0)
+        {
+            std::size_t const buf_size = (*cursor).size();
+            std::size_t const avail = buf_size - cursor_front;
+            if (left <= avail)
+            {
+                back_skip_ = avail - left;
+                ++cursor;
+                last_ = cursor;
+                return;
+            }
+            left -= avail;
+            ++cursor;
+            cursor_front = 0;
+        }
+
+        last_ = cursor;
+    }
+
+    data_view data() const noexcept
+    {
+        return data_view(first_, last_, front_skip_, back_skip_);
+    }
+
+    void remove_prefix(std::size_t n) noexcept
+    {
+        while (n > 0 && first_ != last_)
+        {
+            std::size_t const buf_total = (*first_).size();
+            std::size_t live = buf_total - front_skip_;
+            auto next = first_;
+            ++next;
+            bool const is_last = (next == last_);
+            if (is_last)
+                live -= back_skip_;
+
+            if (n < live)
+            {
+                front_skip_ += n;
+                return;
+            }
+
+            n -= live;
+            if (is_last)
+            {
+                first_ = last_;
+                front_skip_ = 0;
+                back_skip_ = 0;
+                return;
+            }
+            ++first_;
+            front_skip_ = 0;
+        }
+    }
+};
+
+} // namespace detail
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/detail/stop_requested_exception.hpp b/include/boost/capy/detail/stop_requested_exception.hpp
new file mode 100644
index 000000000..f5bc47e98
--- /dev/null
+++ b/include/boost/capy/detail/stop_requested_exception.hpp
@@ -0,0 +1,28 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_DETAIL_STOP_REQUESTED_EXCEPTION_HPP
+#define BOOST_CAPY_DETAIL_STOP_REQUESTED_EXCEPTION_HPP
+
+namespace boost {
+namespace capy {
+namespace detail {
+
+/* Lightweight sentinel thrown inside quitter<T> when the stop token
+   is triggered. Not derived from std::exception. Never escapes the
+   coroutine — unhandled_exception() catches it and sets the stopped
+   flag. The cost is one throw+catch per cancellation per coroutine
+   lifetime. */
+struct stop_requested_exception {};
+
+} // namespace detail
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/error.hpp b/include/boost/capy/error.hpp
index 0e010eb86..29ec382ac 100644
--- a/include/boost/capy/error.hpp
+++ b/include/boost/capy/error.hpp
@@ -43,10 +43,11 @@ enum class error
     stream_truncated,
 
     /// Requested item was not found. Compare with `cond::not_found`.
-    not_found
-};
+    not_found,
 
-//-----------------------------------------------
+    /// Operation timed out. Compare with `cond::timeout`.
+    timeout
+};
 
 } // capy
 } // boost
@@ -61,8 +62,6 @@ struct is_error_code_enum<
 namespace boost {
 namespace capy {
 
-//-----------------------------------------------
-
 namespace detail {
  
 struct BOOST_CAPY_SYMBOL_VISIBLE
@@ -80,8 +79,7 @@ BOOST_CAPY_DECL extern error_cat_type error_cat;
 
 } // detail
 
-//-----------------------------------------------
-
+/// Create an error_code from an error value.
 inline
 std::error_code
 make_error_code(
diff --git a/include/boost/capy/ex/any_executor.hpp b/include/boost/capy/ex/any_executor.hpp
index 0fa5a357e..fa8cad49f 100644
--- a/include/boost/capy/ex/any_executor.hpp
+++ b/include/boost/capy/ex/any_executor.hpp
@@ -11,6 +11,7 @@
 #define BOOST_CAPY_ANY_EXECUTOR_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/continuation.hpp>
 #include <concepts>
 #include <coroutine>
 #include <memory>
@@ -91,8 +92,8 @@ class any_executor
         virtual execution_context& context() const noexcept = 0;
         virtual void on_work_started() const noexcept = 0;
         virtual void on_work_finished() const noexcept = 0;
-        virtual std::coroutine_handle<> dispatch(std::coroutine_handle<>) const = 0;
-        virtual void post(std::coroutine_handle<>) const = 0;
+        virtual std::coroutine_handle<> dispatch(continuation&) const = 0;
+        virtual void post(continuation&) const = 0;
         virtual bool equals(impl_base const*) const noexcept = 0;
         virtual std::type_info const& target_type() const noexcept = 0;
     };
@@ -123,14 +124,14 @@ class any_executor
             ex_.on_work_finished();
         }
 
-        std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const override
+        std::coroutine_handle<> dispatch(continuation& c) const override
         {
-            return ex_.dispatch(h);
+            return ex_.dispatch(c);
         }
 
-        void post(std::coroutine_handle<> h) const override
+        void post(continuation& c) const override
         {
-            ex_.post(h);
+            ex_.post(c);
         }
 
         bool equals(impl_base const* other) const noexcept override
@@ -147,7 +148,7 @@ class any_executor
     };
 
 public:
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty `any_executor`. Calling any executor
         operations on a default-constructed instance results in
@@ -158,7 +159,7 @@ class any_executor
     */
     any_executor() = default;
 
-    /** Copy constructor.
+    /** Construct a copy.
 
         Creates a new `any_executor` sharing ownership of the
         underlying executor with `other`.
@@ -240,36 +241,39 @@ class any_executor
         p_->on_work_finished();
     }
 
-    /** Dispatches a coroutine handle through the wrapped executor.
+    /** Dispatches a continuation through the wrapped executor.
 
         Returns a handle for symmetric transfer. If running in the
-        executor's thread, returns `h`. Otherwise, posts the coroutine
-        for later execution and returns `std::noop_coroutine()`.
+        executor's thread, returns `c.h`. Otherwise, posts the
+        continuation for later execution and returns
+        `std::noop_coroutine()`.
 
-        @param h The coroutine handle to dispatch for resumption.
+        @param c The continuation to dispatch for resumption.
+                 Must remain at a stable address until dequeued.
 
         @return A handle for symmetric transfer or `std::noop_coroutine()`.
 
         @pre This instance holds a valid executor.
     */
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
-        return p_->dispatch(h);
+        return p_->dispatch(c);
     }
 
-    /** Posts a coroutine handle to the wrapped executor.
+    /** Posts a continuation to the wrapped executor.
 
-        Posts the coroutine handle to the executor for later execution
+        Posts the continuation to the executor for later execution
         and returns. The caller should transfer to `std::noop_coroutine()`
         after calling this.
 
-        @param h The coroutine handle to post for resumption.
+        @param c The continuation to post for resumption.
+                 Must remain at a stable address until dequeued.
 
         @pre This instance holds a valid executor.
     */
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
-        p_->post(h);
+        p_->post(c);
     }
 
     /** Compares two executor wrappers for equality.
diff --git a/include/boost/capy/ex/async_event.hpp b/include/boost/capy/ex/async_event.hpp
index fa79c1851..ed254b2be 100644
--- a/include/boost/capy/ex/async_event.hpp
+++ b/include/boost/capy/ex/async_event.hpp
@@ -12,6 +12,7 @@
 
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/intrusive.hpp>
+#include <boost/capy/continuation.hpp>
 #include <boost/capy/concept/executor.hpp>
 #include <boost/capy/error.hpp>
 #include <boost/capy/ex/io_env.hpp>
@@ -69,9 +70,15 @@ namespace capy {
 
     @par Thread Safety
 
+    Distinct objects: Safe.@n
+    Shared objects: Unsafe.
+
     The event operations are designed for single-threaded use on one
     executor. The stop callback may fire from any thread.
 
+    This type is non-copyable and non-movable because suspended
+    waiters hold intrusive pointers into the event's internal list.
+
     @par Example
     @code
     async_event event;
@@ -107,7 +114,7 @@ class async_event
         friend class async_event;
 
         async_event* e_;
-        std::coroutine_handle<> h_;
+        continuation cont_;
         executor_ref ex_;
 
         // Declared before stop_cb_buf_: the callback
@@ -128,7 +135,7 @@ class async_event
                     true, std::memory_order_acq_rel))
                 {
                     self_->canceled_ = true;
-                    self_->ex_.post(self_->h_);
+                    self_->ex_.post(self_->cont_);
                 }
             }
         };
@@ -139,15 +146,11 @@ class async_event
         // Aligned storage for stop_cb_t. Declared last:
         // its destructor may block while the callback
         // accesses the members above.
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4324) // padded due to alignas
-#endif
+        BOOST_CAPY_MSVC_WARNING_PUSH
+        BOOST_CAPY_MSVC_WARNING_DISABLE(4324) // padded due to alignas
         alignas(stop_cb_t)
             unsigned char stop_cb_buf_[sizeof(stop_cb_t)];
-#ifdef _MSC_VER
-# pragma warning(pop)
-#endif
+        BOOST_CAPY_MSVC_WARNING_POP
 
         stop_cb_t& stop_cb_() noexcept
         {
@@ -171,7 +174,7 @@ class async_event
 
         wait_awaiter(wait_awaiter&& o) noexcept
             : e_(o.e_)
-            , h_(o.h_)
+            , cont_(o.cont_)
             , ex_(o.ex_)
             , claimed_(o.claimed_.load(
                 std::memory_order_relaxed))
@@ -201,7 +204,7 @@ class async_event
                 canceled_ = true;
                 return h;
             }
-            h_ = h;
+            cont_.h = h;
             ex_ = env->executor;
             e_->waiters_.push_back(this);
             in_list_ = true;
@@ -232,17 +235,26 @@ class async_event
         }
     };
 
+    /// Construct an unset event.
     async_event() = default;
 
-    // Non-copyable, non-movable
+    /// Copy constructor (deleted).
     async_event(async_event const&) = delete;
+
+    /// Copy assignment (deleted).
     async_event& operator=(async_event const&) = delete;
 
+    /// Move constructor (deleted).
+    async_event(async_event&&) = delete;
+
+    /// Move assignment (deleted).
+    async_event& operator=(async_event&&) = delete;
+
     /** Returns an awaiter that waits until the event is set.
 
         If the event is already set, completes immediately.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
     */
     wait_awaiter wait() noexcept
     {
@@ -267,7 +279,7 @@ class async_event
             if(!w->claimed_.exchange(
                 true, std::memory_order_acq_rel))
             {
-                w->ex_.post(w->h_);
+                w->ex_.post(w->cont_);
             }
         }
     }
diff --git a/include/boost/capy/ex/async_mutex.hpp b/include/boost/capy/ex/async_mutex.hpp
index 4e7898d95..90c9a4a13 100644
--- a/include/boost/capy/ex/async_mutex.hpp
+++ b/include/boost/capy/ex/async_mutex.hpp
@@ -12,6 +12,7 @@
 
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/intrusive.hpp>
+#include <boost/capy/continuation.hpp>
 #include <boost/capy/concept/executor.hpp>
 #include <boost/capy/error.hpp>
 #include <boost/capy/ex/io_env.hpp>
@@ -113,9 +114,15 @@ namespace capy {
 
     @par Thread Safety
 
+    Distinct objects: Safe.@n
+    Shared objects: Unsafe.
+
     The mutex operations are designed for single-threaded use on one
     executor. The stop callback may fire from any thread.
 
+    This type is non-copyable and non-movable because suspended
+    waiters hold intrusive pointers into the mutex's internal list.
+
     @par Example
     @code
     async_mutex cm;
@@ -158,7 +165,7 @@ class async_mutex
         friend class async_mutex;
 
         async_mutex* m_;
-        std::coroutine_handle<> h_;
+        continuation cont_;
         executor_ref ex_;
 
         // These members must be declared before stop_cb_
@@ -177,7 +184,7 @@ class async_mutex
                     true, std::memory_order_acq_rel))
                 {
                     self_->canceled_ = true;
-                    self_->ex_.post(self_->h_);
+                    self_->ex_.post(self_->cont_);
                 }
             }
         };
@@ -188,15 +195,11 @@ class async_mutex
         // Aligned storage for stop_cb_t. Declared last:
         // its destructor may block while the callback
         // accesses the members above.
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4324) // padded due to alignas
-#endif
+        BOOST_CAPY_MSVC_WARNING_PUSH
+        BOOST_CAPY_MSVC_WARNING_DISABLE(4324) // padded due to alignas
         alignas(stop_cb_t)
             unsigned char stop_cb_buf_[sizeof(stop_cb_t)];
-#ifdef _MSC_VER
-# pragma warning(pop)
-#endif
+        BOOST_CAPY_MSVC_WARNING_POP
 
         stop_cb_t& stop_cb_() noexcept
         {
@@ -221,7 +224,7 @@ class async_mutex
 
         lock_awaiter(lock_awaiter&& o) noexcept
             : m_(o.m_)
-            , h_(o.h_)
+            , cont_(o.cont_)
             , ex_(o.ex_)
             , claimed_(o.claimed_.load(
                 std::memory_order_relaxed))
@@ -255,7 +258,7 @@ class async_mutex
                 canceled_ = true;
                 return h;
             }
-            h_ = h;
+            cont_.h = h;
             ex_ = env->executor;
             m_->waiters_.push_back(this);
             ::new(stop_cb_buf_) stop_cb_t(
@@ -367,15 +370,24 @@ class async_mutex
         }
     };
 
+    /// Construct an unlocked mutex.
     async_mutex() = default;
 
-    // Non-copyable, non-movable
+    /// Copy constructor (deleted).
     async_mutex(async_mutex const&) = delete;
+
+    /// Copy assignment (deleted).
     async_mutex& operator=(async_mutex const&) = delete;
 
+    /// Move constructor (deleted).
+    async_mutex(async_mutex&&) = delete;
+
+    /// Move assignment (deleted).
+    async_mutex& operator=(async_mutex&&) = delete;
+
     /** Returns an awaiter that acquires the mutex.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
     */
     lock_awaiter lock() noexcept
     {
@@ -384,7 +396,7 @@ class async_mutex
 
     /** Returns an awaiter that acquires the mutex with RAII.
 
-        @return An awaitable yielding `(error_code,lock_guard)`.
+        @return An awaitable that await-returns `(error_code,lock_guard)`.
     */
     lock_guard_awaiter scoped_lock() noexcept
     {
@@ -411,7 +423,7 @@ class async_mutex
             if(!waiter->claimed_.exchange(
                 true, std::memory_order_acq_rel))
             {
-                waiter->ex_.post(waiter->h_);
+                waiter->ex_.post(waiter->cont_);
                 return;
             }
         }
diff --git a/include/boost/capy/ex/detail/strand_service.hpp b/include/boost/capy/ex/detail/strand_service.hpp
index 7e1960a29..8f4dc144c 100644
--- a/include/boost/capy/ex/detail/strand_service.hpp
+++ b/include/boost/capy/ex/detail/strand_service.hpp
@@ -10,12 +10,13 @@
 #ifndef BOOST_CAPY_EX_DETAIL_STRAND_SERVICE_HPP
 #define BOOST_CAPY_EX_DETAIL_STRAND_SERVICE_HPP
 
+#include <boost/capy/continuation.hpp>
 #include <boost/capy/detail/config.hpp>
 #include <coroutine>
 #include <boost/capy/ex/executor_ref.hpp>
 #include <boost/capy/ex/execution_context.hpp>
 
-#include <cstddef>
+#include <memory>
 
 namespace boost {
 namespace capy {
@@ -32,13 +33,11 @@ struct is_strand : std::false_type {};
 template<typename E>
 struct is_strand<strand<E>> : std::true_type {};
 
-//----------------------------------------------------------
+/** Service that manages strand implementations.
 
-/** Service that manages pooled strand implementations.
-
-    This service maintains a fixed pool of strand_impl objects.
-    When a strand is constructed, it obtains a pointer to one
-    of these pooled implementations based on a hash.
+    Allocates one `strand_impl` per strand. Maintains a shared pool of
+    mutexes that strand_impls borrow, sized to keep memory bounded as
+    strand count grows.
 
     @par Thread Safety
     The service operations are thread-safe.
@@ -51,16 +50,16 @@ class BOOST_CAPY_DECL strand_service
     */
     virtual ~strand_service();
 
-    /** Return a pointer to a pooled implementation.
+    /** Allocate a new strand implementation.
 
-        Uses a hash to select an implementation from the pool.
-        The salt is incremented after each call to distribute
-        strands across the pool.
+        Each call returns a fresh `strand_impl` owned by the returned
+        `shared_ptr`. The implementation borrows a mutex from the
+        service's shared pool.
 
-        @return Pointer to a strand_impl from the pool.
+        @return shared_ptr to the new strand_impl.
     */
-    virtual strand_impl*
-    get_implementation() = 0;
+    virtual std::shared_ptr<strand_impl>
+    create_implementation() = 0;
 
     /** Check if THIS thread is currently executing in the strand. */
     static bool
@@ -68,11 +67,17 @@ class BOOST_CAPY_DECL strand_service
 
     /** Dispatch through strand; returns handle for symmetric transfer. */
     static std::coroutine_handle<>
-    dispatch(strand_impl& impl, executor_ref ex, std::coroutine_handle<> h);
+    dispatch(
+        std::shared_ptr<strand_impl> const& impl,
+        executor_ref ex,
+        continuation& c);
 
     /** Post to strand queue. */
     static void
-    post(strand_impl& impl, executor_ref ex, std::coroutine_handle<> h);
+    post(
+        std::shared_ptr<strand_impl> const& impl,
+        executor_ref ex,
+        continuation& c);
 
 protected:
     strand_service();
diff --git a/include/boost/capy/ex/detail/timer_service.hpp b/include/boost/capy/ex/detail/timer_service.hpp
new file mode 100644
index 000000000..c383c539e
--- /dev/null
+++ b/include/boost/capy/ex/detail/timer_service.hpp
@@ -0,0 +1,128 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EX_TIMER_SERVICE_HPP
+#define BOOST_CAPY_EX_TIMER_SERVICE_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/ex/execution_context.hpp>
+
+#include <chrono>
+#include <cstdint>
+#include <functional>
+#include <mutex>
+#include <condition_variable>
+#include <queue>
+#include <thread>
+#include <unordered_set>
+#include <vector>
+
+namespace boost {
+namespace capy {
+namespace detail {
+
+/* Shared timer thread for an execution_context.
+
+   One background std::thread per execution_context. All timeouts
+   scheduled through this context share the same thread, which sleeps
+   on a condition variable until the next deadline.
+
+   The timer thread never touches coroutine frames or executors
+   directly — callbacks are responsible for posting work through
+   the appropriate executor.
+*/
+
+class BOOST_CAPY_DECL
+    timer_service
+    : public execution_context::service
+{
+public:
+    using timer_id = std::uint64_t;
+
+    explicit timer_service(execution_context& ctx);
+
+    // Calls shutdown() to join the background thread.
+    // Handles the discard path in use_service_impl where
+    // a duplicate service is deleted without shutdown().
+    ~timer_service();
+
+    /** Schedule a callback to fire after a duration.
+
+        The callback is invoked on the timer service's background
+        thread. It must not block for extended periods.
+
+        @return An id that can be passed to cancel().
+    */
+    template<typename Rep, typename Period>
+    timer_id schedule_after(
+        std::chrono::duration<Rep, Period> dur,
+        std::function<void()> cb)
+    {
+        auto deadline = std::chrono::steady_clock::now() + dur;
+        return schedule_at(deadline, std::move(cb));
+    }
+
+    /** Cancel a pending timer.
+
+        After this function returns, the callback is guaranteed
+        not to be running and will never be invoked. If the
+        callback is currently executing on the timer thread,
+        this call blocks until it completes.
+
+        Safe to call with any id, including ids that have
+        already fired, been cancelled, or were never issued.
+    */
+    void cancel(timer_id id);
+
+protected:
+    void shutdown() override;
+
+private:
+    void stop_and_join();
+    struct entry
+    {
+        std::chrono::steady_clock::time_point deadline;
+        timer_id id;
+        std::function<void()> callback;
+
+        bool operator>(entry const& o) const noexcept
+        {
+            return deadline > o.deadline;
+        }
+    };
+
+    timer_id schedule_at(
+        std::chrono::steady_clock::time_point deadline,
+        std::function<void()> cb);
+
+    void run();
+
+// warning C4251: std types need to have dll-interface
+    BOOST_CAPY_MSVC_WARNING_PUSH
+    BOOST_CAPY_MSVC_WARNING_DISABLE(4251)
+    std::mutex mutex_;
+    std::condition_variable cv_;
+    std::condition_variable cancel_cv_;
+    std::priority_queue<
+        entry,
+        std::vector<entry>,
+        std::greater<>> queue_;
+    std::unordered_set<timer_id> active_ids_;
+    timer_id next_id_ = 0;
+    timer_id executing_id_ = 0;
+    bool stopped_ = false;
+    std::thread thread_;
+    BOOST_CAPY_MSVC_WARNING_POP
+};
+
+} // detail
+} // capy
+} // boost
+
+#endif
diff --git a/include/boost/capy/ex/execution_context.hpp b/include/boost/capy/ex/execution_context.hpp
index acfc7c070..1000dec73 100644
--- a/include/boost/capy/ex/execution_context.hpp
+++ b/include/boost/capy/ex/execution_context.hpp
@@ -163,15 +163,11 @@ class BOOST_CAPY_DECL
         service* next_ = nullptr;
 
 // warning C4251: 'std::type_index' needs to have dll-interface
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4251)
-#endif
+        BOOST_CAPY_MSVC_WARNING_PUSH
+        BOOST_CAPY_MSVC_WARNING_DISABLE(4251)
         detail::type_index t0_{detail::type_id<void>()};
         detail::type_index t1_{detail::type_id<void>()};
-#ifdef _MSC_VER
-# pragma warning(pop)
-#endif
+        BOOST_CAPY_MSVC_WARNING_POP
     };
 
     //------------------------------------------------
@@ -193,7 +189,7 @@ class BOOST_CAPY_DECL
     */
     ~execution_context();
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         @par Exception Safety
         Strong guarantee.
@@ -503,16 +499,12 @@ class BOOST_CAPY_DECL
     struct BOOST_CAPY_DECL
         factory
     {
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4251)
-#endif
 // warning C4251: 'std::type_index' needs to have dll-interface
+        BOOST_CAPY_MSVC_WARNING_PUSH
+        BOOST_CAPY_MSVC_WARNING_DISABLE(4251)
         detail::type_index t0;
         detail::type_index t1;
-#ifdef _MSC_VER
-# pragma warning(pop)
-#endif
+        BOOST_CAPY_MSVC_WARNING_POP
 
         factory(
             detail::type_info const& t0_,
@@ -531,16 +523,12 @@ class BOOST_CAPY_DECL
     service& use_service_impl(factory& f);
     service& make_service_impl(factory& f);
 
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4251)
-#endif
-// warning C4251: 'std::type_index' needs to have dll-interface
+// warning C4251: std::mutex, std::shared_ptr need dll-interface
+    BOOST_CAPY_MSVC_WARNING_PUSH
+    BOOST_CAPY_MSVC_WARNING_DISABLE(4251)
     mutable std::mutex mutex_;
     std::shared_ptr<void> owned_;
-#ifdef _MSC_VER
-# pragma warning(pop)
-#endif
+    BOOST_CAPY_MSVC_WARNING_POP
     std::pmr::memory_resource* frame_alloc_ = nullptr;
     service* head_ = nullptr;
     bool shutdown_ = false;
diff --git a/include/boost/capy/ex/executor_ref.hpp b/include/boost/capy/ex/executor_ref.hpp
index b3c0b644d..f6010b7bd 100644
--- a/include/boost/capy/ex/executor_ref.hpp
+++ b/include/boost/capy/ex/executor_ref.hpp
@@ -12,6 +12,7 @@
 
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/type_id.hpp>
+#include <boost/capy/continuation.hpp>
 #include <concepts>
 #include <coroutine>
 #include <type_traits>
@@ -30,8 +31,8 @@ struct executor_vtable
     execution_context& (*context)(void const*) noexcept;
     void (*on_work_started)(void const*) noexcept;
     void (*on_work_finished)(void const*) noexcept;
-    void (*post)(void const*, std::coroutine_handle<>);
-    std::coroutine_handle<> (*dispatch)(void const*, std::coroutine_handle<>);
+    void (*post)(void const*, continuation&);
+    std::coroutine_handle<> (*dispatch)(void const*, continuation&);
     bool (*equals)(void const*, void const*) noexcept;
     detail::type_info const* type_id;
 };
@@ -52,12 +53,12 @@ inline constexpr executor_vtable vtable_for = {
         const_cast<Ex*>(static_cast<Ex const*>(p))->on_work_finished();
     },
     // post
-    [](void const* p, std::coroutine_handle<> h) {
-        static_cast<Ex const*>(p)->post(h);
+    [](void const* p, continuation& c) {
+        static_cast<Ex const*>(p)->post(c);
     },
     // dispatch
-    [](void const* p, std::coroutine_handle<> h) -> std::coroutine_handle<> {
-        return static_cast<Ex const*>(p)->dispatch(h);
+    [](void const* p, continuation& c) -> std::coroutine_handle<> {
+        return static_cast<Ex const*>(p)->dispatch(c);
     },
     // equals
     [](void const* a, void const* b) noexcept -> bool {
@@ -97,7 +98,7 @@ inline constexpr executor_vtable vtable_for = {
     void store_executor(executor_ref ex)
     {
         if(ex)
-            ex.post(my_coroutine);
+            ex.post(my_continuation);
     }
 
     io_context ctx;
@@ -112,7 +113,7 @@ class executor_ref
     detail::executor_vtable const* vt_ = nullptr;
 
 public:
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty `executor_ref`. Calling any executor
         operations on a default-constructed instance results in
@@ -120,7 +121,7 @@ class executor_ref
     */
     executor_ref() = default;
 
-    /** Copy constructor.
+    /** Construct a copy.
 
         Copies the internal pointers, preserving identity.
         This enables the same-executor optimization when passing
@@ -198,36 +199,39 @@ class executor_ref
         vt_->on_work_finished(ex_);
     }
 
-    /** Dispatches a coroutine handle through the wrapped executor.
+    /** Dispatches a continuation through the wrapped executor.
 
         Returns a handle for symmetric transfer. If running in the
-        executor's thread, returns `h`. Otherwise, posts the coroutine
-        for later execution and returns `std::noop_coroutine()`.
+        executor's thread, returns `c.h`. Otherwise, posts the
+        continuation for later execution and returns
+        `std::noop_coroutine()`.
 
-        @param h The coroutine handle to dispatch for resumption.
+        @param c The continuation to dispatch for resumption.
+                 Must remain at a stable address until dequeued.
 
         @return A handle for symmetric transfer or `std::noop_coroutine()`.
 
         @pre This instance was constructed with a valid executor.
     */
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
-        return vt_->dispatch(ex_, h);
+        return vt_->dispatch(ex_, c);
     }
 
-    /** Posts a coroutine handle to the wrapped executor.
+    /** Posts a continuation to the wrapped executor.
 
-        Posts the coroutine handle to the executor for later execution
+        Posts the continuation to the executor for later execution
         and returns. The caller should transfer to `std::noop_coroutine()`
         after calling this.
 
-        @param h The coroutine handle to post for resumption.
+        @param c The continuation to post for resumption.
+                 Must remain at a stable address until dequeued.
 
         @pre This instance was constructed with a valid executor.
     */
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
-        vt_->post(ex_, h);
+        vt_->post(ex_, c);
     }
 
     /** Compares two executor references for equality.
diff --git a/include/boost/capy/ex/frame_alloc_mixin.hpp b/include/boost/capy/ex/frame_alloc_mixin.hpp
new file mode 100644
index 000000000..7888440b2
--- /dev/null
+++ b/include/boost/capy/ex/frame_alloc_mixin.hpp
@@ -0,0 +1,118 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_EX_FRAME_ALLOC_MIXIN_HPP
+#define BOOST_CAPY_EX_FRAME_ALLOC_MIXIN_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/ex/frame_allocator.hpp>
+#include <boost/capy/ex/recycling_memory_resource.hpp>
+
+#include <cstddef>
+#include <cstring>
+#include <memory_resource>
+
+namespace boost {
+namespace capy {
+
+/** Mixin that adds frame-allocator-aware allocation to a promise type.
+
+    Inherit from this class in any coroutine promise type to opt into
+    TLS-based frame allocation with the recycling memory resource
+    fast path. The mixin provides `operator new` and `operator delete`
+    that:
+
+    1. Read the thread-local frame allocator set by `run_async` or `run`.
+    2. Bypass virtual dispatch when the allocator is the default
+       recycling memory resource.
+    3. Store the allocator pointer at the end of each frame for
+       correct deallocation even when TLS changes between allocation
+       and deallocation.
+
+    This is the same allocation strategy used by @ref
+    io_awaitable_promise_base. Use this mixin directly when your
+    promise type does not need the full environment and continuation
+    support that `io_awaitable_promise_base` provides.
+
+    @par Example
+    @code
+    struct my_internal_coroutine
+    {
+        struct promise_type : frame_alloc_mixin
+        {
+            my_internal_coroutine get_return_object();
+            std::suspend_always initial_suspend() noexcept;
+            std::suspend_always final_suspend() noexcept;
+            void return_void();
+            void unhandled_exception() noexcept;
+        };
+    };
+    @endcode
+
+    @par Thread Safety
+    The allocation fast path uses thread-local storage and requires
+    no synchronization. The global pool fallback is mutex-protected.
+
+    @see io_awaitable_promise_base, frame_allocator, recycling_memory_resource
+*/
+struct frame_alloc_mixin
+{
+    /** Allocate a coroutine frame.
+
+        Uses the thread-local frame allocator set by run_async.
+        Falls back to default memory resource if not set.
+        Stores the allocator pointer at the end of each frame for
+        correct deallocation even when TLS changes. Uses memcpy
+        to avoid alignment requirements on the trailing pointer.
+        Bypasses virtual dispatch for the recycling allocator.
+    */
+    static void* operator new(std::size_t size)
+    {
+        static auto* const rmr = get_recycling_memory_resource();
+
+        auto* mr = get_current_frame_allocator();
+        if(!mr)
+            mr = std::pmr::get_default_resource();
+
+        auto total = size + sizeof(std::pmr::memory_resource*);
+        void* raw;
+        if(mr == rmr)
+            raw = static_cast<recycling_memory_resource*>(mr)
+                ->allocate_fast(total, alignof(std::max_align_t));
+        else
+            raw = mr->allocate(total, alignof(std::max_align_t));
+        std::memcpy(static_cast<char*>(raw) + size, &mr, sizeof(mr));
+        return raw;
+    }
+
+    /** Deallocate a coroutine frame.
+
+        Reads the allocator pointer stored at the end of the frame
+        to ensure correct deallocation regardless of current TLS.
+        Bypasses virtual dispatch for the recycling allocator.
+    */
+    static void operator delete(void* ptr, std::size_t size) noexcept
+    {
+        static auto* const rmr = get_recycling_memory_resource();
+
+        std::pmr::memory_resource* mr;
+        std::memcpy(&mr, static_cast<char*>(ptr) + size, sizeof(mr));
+        auto total = size + sizeof(std::pmr::memory_resource*);
+        if(mr == rmr)
+            static_cast<recycling_memory_resource*>(mr)
+                ->deallocate_fast(ptr, total, alignof(std::max_align_t));
+        else
+            mr->deallocate(ptr, total, alignof(std::max_align_t));
+    }
+};
+
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/ex/frame_allocator.hpp b/include/boost/capy/ex/frame_allocator.hpp
index b912c8190..8dbdb3b2e 100644
--- a/include/boost/capy/ex/frame_allocator.hpp
+++ b/include/boost/capy/ex/frame_allocator.hpp
@@ -12,6 +12,7 @@
 
 #include <boost/capy/detail/config.hpp>
 
+#include <coroutine>
 #include <memory_resource>
 
 /*  Design rationale (pdimov):
@@ -109,6 +110,39 @@ set_current_frame_allocator(
     detail::current_frame_allocator_ref() = mr;
 }
 
+/** Resume a coroutine handle with frame-allocator TLS protection.
+
+    Saves the current thread-local frame allocator before
+    calling `h.resume()`, then restores it after the call
+    returns. This prevents a resumed coroutine's
+    `await_resume` from permanently overwriting the caller's
+    allocator value.
+
+    Between a coroutine's resumption and its next child
+    invocation, arbitrary user code may run. If that code
+    resumes a coroutine from a different chain on this
+    thread, the other coroutine's `await_resume` overwrites
+    TLS with its own allocator. Without save/restore, the
+    original coroutine's next child would allocate from
+    the wrong resource.
+
+    Event loops, strand dispatch loops, and any code that
+    calls `.resume()` on a coroutine handle should use
+    this function instead of calling `.resume()` directly.
+    See the @ref Executor concept documentation for details.
+
+    @param h The coroutine handle to resume.
+
+    @see get_current_frame_allocator, set_current_frame_allocator
+*/
+inline void
+safe_resume(std::coroutine_handle<> h) noexcept
+{
+    auto* saved = get_current_frame_allocator();
+    h.resume();
+    set_current_frame_allocator(saved);
+}
+
 } // namespace capy
 } // namespace boost
 
diff --git a/include/boost/capy/ex/immediate.hpp b/include/boost/capy/ex/immediate.hpp
index 548c76413..b3c7c1fc1 100644
--- a/include/boost/capy/ex/immediate.hpp
+++ b/include/boost/capy/ex/immediate.hpp
@@ -121,8 +121,6 @@ struct immediate
     }
 };
 
-//----------------------------------------------------------
-
 /** Create an immediate awaitable for a successful io_result.
 
     This helper creates an @ref immediate wrapping an @ref io_result
@@ -198,8 +196,6 @@ ready(T1 t1, T2 t2, T3 t3)
     return {{{}, std::move(t1), std::move(t2), std::move(t3)}};
 }
 
-//----------------------------------------------------------
-
 /** Create an immediate awaitable for a failed io_result.
 
     This helper creates an @ref immediate wrapping an @ref io_result
diff --git a/include/boost/capy/ex/io_awaitable_promise_base.hpp b/include/boost/capy/ex/io_awaitable_promise_base.hpp
index e3a4c505f..6a9185373 100644
--- a/include/boost/capy/ex/io_awaitable_promise_base.hpp
+++ b/include/boost/capy/ex/io_awaitable_promise_base.hpp
@@ -11,14 +11,12 @@
 #define BOOST_CAPY_EX_IO_AWAITABLE_PROMISE_BASE_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/ex/frame_alloc_mixin.hpp>
 #include <boost/capy/ex/frame_allocator.hpp>
 #include <boost/capy/ex/io_env.hpp>
-#include <boost/capy/ex/recycling_memory_resource.hpp>
 #include <boost/capy/ex/this_coro.hpp>
 
 #include <coroutine>
-#include <cstddef>
-#include <cstring>
 #include <memory_resource>
 #include <stop_token>
 #include <type_traits>
@@ -39,7 +37,7 @@ namespace capy {
     3. **Environment access** — Coroutine code can retrieve the environment
        via `co_await this_coro::environment`, or individual fields via
        `co_await this_coro::executor`, `co_await this_coro::stop_token`,
-       and `co_await this_coro::allocator`.
+       and `co_await this_coro::frame_allocator`.
 
     @tparam Derived The derived promise type (CRTP pattern).
 
@@ -65,12 +63,12 @@ namespace capy {
     my_task example()
     {
         auto env = co_await this_coro::environment;
-        // Access env->executor, env->stop_token, env->allocator
+        // Access env->executor, env->stop_token, env->frame_allocator
 
         // Or use fine-grained accessors:
         auto ex = co_await this_coro::executor;
         auto token = co_await this_coro::stop_token;
-        auto* alloc = co_await this_coro::allocator;
+        auto* alloc = co_await this_coro::frame_allocator;
     }
     @endcode
 
@@ -125,68 +123,27 @@ namespace capy {
     thread of execution, so no synchronization is required.
 
     @see this_coro::environment, this_coro::executor,
-         this_coro::stop_token, this_coro::allocator
+         this_coro::stop_token, this_coro::frame_allocator
     @see io_env
     @see IoAwaitable
 */
 template<typename Derived>
 class io_awaitable_promise_base
+    : public frame_alloc_mixin
 {
     io_env const* env_ = nullptr;
     mutable std::coroutine_handle<> cont_{std::noop_coroutine()};
 
 public:
-    /** Allocate a coroutine frame.
-
-        Uses the thread-local frame allocator set by run_async.
-        Falls back to default memory resource if not set.
-        Stores the allocator pointer at the end of each frame for
-        correct deallocation even when TLS changes. Uses memcpy
-        to avoid alignment requirements on the trailing pointer.
-        Bypasses virtual dispatch for the recycling allocator.
-    */
-    static void* operator new(std::size_t size)
-    {
-        static auto* const rmr = get_recycling_memory_resource();
-
-        auto* mr = get_current_frame_allocator();
-        if(!mr)
-            mr = std::pmr::get_default_resource();
-
-        auto total = size + sizeof(std::pmr::memory_resource*);
-        void* raw;
-        if(mr == rmr)
-            raw = static_cast<recycling_memory_resource*>(mr)
-                ->allocate_fast(total, alignof(std::max_align_t));
-        else
-            raw = mr->allocate(total, alignof(std::max_align_t));
-        std::memcpy(static_cast<char*>(raw) + size, &mr, sizeof(mr));
-        return raw;
-    }
-
-    /** Deallocate a coroutine frame.
-
-        Reads the allocator pointer stored at the end of the frame
-        to ensure correct deallocation regardless of current TLS.
-        Bypasses virtual dispatch for the recycling allocator.
-    */
-    static void operator delete(void* ptr, std::size_t size) noexcept
-    {
-        static auto* const rmr = get_recycling_memory_resource();
-
-        std::pmr::memory_resource* mr;
-        std::memcpy(&mr, static_cast<char*>(ptr) + size, sizeof(mr));
-        auto total = size + sizeof(std::pmr::memory_resource*);
-        if(mr == rmr)
-            static_cast<recycling_memory_resource*>(mr)
-                ->deallocate_fast(ptr, total, alignof(std::max_align_t));
-        else
-            mr->deallocate(ptr, total, alignof(std::max_align_t));
-    }
-
     ~io_awaitable_promise_base()
     {
-        // Abnormal teardown: destroy orphaned continuation
+        // Abnormal teardown: destroy an orphaned continuation, e.g.
+        // a run_async trampoline when the task is destroyed before
+        // reaching final_suspend. Callers must not destroy a task
+        // via handle().destroy() while it is being awaited by a
+        // parent coroutine: that puts cont_ under another owner
+        // and would produce a double-destroy from this branch. See
+        // task::handle() / quitter::handle() for the contract.
         if(cont_ != std::noop_coroutine())
             cont_.destroy();
     }
diff --git a/include/boost/capy/ex/io_env.hpp b/include/boost/capy/ex/io_env.hpp
index 085b55082..fdf49ae01 100644
--- a/include/boost/capy/ex/io_env.hpp
+++ b/include/boost/capy/ex/io_env.hpp
@@ -13,6 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/ex/executor_ref.hpp>
 
+#include <coroutine>
 #include <memory_resource>
 #include <stop_token>
 
@@ -52,6 +53,7 @@ struct io_env
         When null, the default allocator is used.
     */
     std::pmr::memory_resource* frame_allocator = nullptr;
+
 };
 
 } // capy
diff --git a/include/boost/capy/ex/recycling_memory_resource.hpp b/include/boost/capy/ex/recycling_memory_resource.hpp
index 604c538b8..5b9c3ed2f 100644
--- a/include/boost/capy/ex/recycling_memory_resource.hpp
+++ b/include/boost/capy/ex/recycling_memory_resource.hpp
@@ -46,10 +46,8 @@ namespace capy {
     @see get_recycling_memory_resource
     @see run_async
 */
-#ifdef _MSC_VER
-# pragma warning(push)
-# pragma warning(disable: 4275) // non dll-interface base class
-#endif
+BOOST_CAPY_MSVC_WARNING_PUSH
+BOOST_CAPY_MSVC_WARNING_DISABLE(4275) // non dll-interface base class
 class BOOST_CAPY_DECL recycling_memory_resource : public std::pmr::memory_resource
 {
     static constexpr std::size_t num_classes = 6;
@@ -184,9 +182,7 @@ class BOOST_CAPY_DECL recycling_memory_resource : public std::pmr::memory_resour
         return this == &other;
     }
 };
-#ifdef _MSC_VER
-# pragma warning(pop)
-#endif
+BOOST_CAPY_MSVC_WARNING_POP
 
 /** Returns pointer to the default recycling memory resource.
 
diff --git a/include/boost/capy/ex/run.hpp b/include/boost/capy/ex/run.hpp
index afb74ee7a..d0699ac14 100644
--- a/include/boost/capy/ex/run.hpp
+++ b/include/boost/capy/ex/run.hpp
@@ -11,11 +11,13 @@
 #define BOOST_CAPY_RUN_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
 #include <boost/capy/detail/run.hpp>
 #include <boost/capy/concept/executor.hpp>
 #include <boost/capy/concept/io_runnable.hpp>
 #include <boost/capy/ex/executor_ref.hpp>
 #include <coroutine>
+#include <boost/capy/ex/frame_alloc_mixin.hpp>
 #include <boost/capy/ex/frame_allocator.hpp>
 #include <boost/capy/ex/io_env.hpp>
 
@@ -58,12 +60,6 @@
 
 namespace boost::capy::detail {
 
-//----------------------------------------------------------
-//
-// dispatch_trampoline - cross-executor dispatch
-//
-//----------------------------------------------------------
-
 /** Minimal coroutine that dispatches through the caller's executor.
 
     Sits between the inner task and the parent when executors
@@ -74,16 +70,17 @@ namespace boost::capy::detail {
 
     The trampoline never touches the task's result.
 */
-struct dispatch_trampoline
+struct BOOST_CAPY_CORO_DESTROY_WHEN_COMPLETE boundary_trampoline
 {
     struct promise_type
+        : frame_alloc_mixin
     {
         executor_ref caller_ex_;
-        std::coroutine_handle<> parent_;
+        continuation parent_;
 
-        dispatch_trampoline get_return_object() noexcept
+        boundary_trampoline get_return_object() noexcept
         {
-            return dispatch_trampoline{
+            return boundary_trampoline{
                 std::coroutine_handle<promise_type>::from_promise(*this)};
         }
 
@@ -96,10 +93,12 @@ struct dispatch_trampoline
                 promise_type* p_;
                 bool await_ready() const noexcept { return false; }
 
-                std::coroutine_handle<> await_suspend(
+                auto await_suspend(
                     std::coroutine_handle<>) noexcept
                 {
-                    return p_->caller_ex_.dispatch(p_->parent_);
+                    p_->caller_ex_.post(p_->parent_);
+                    return detail::symmetric_transfer(
+                        std::noop_coroutine());
                 }
 
                 void await_resume() const noexcept {}
@@ -113,20 +112,20 @@ struct dispatch_trampoline
 
     std::coroutine_handle<promise_type> h_{nullptr};
 
-    dispatch_trampoline() noexcept = default;
+    boundary_trampoline() noexcept = default;
 
-    ~dispatch_trampoline()
+    ~boundary_trampoline()
     {
         if(h_) h_.destroy();
     }
 
-    dispatch_trampoline(dispatch_trampoline const&) = delete;
-    dispatch_trampoline& operator=(dispatch_trampoline const&) = delete;
+    boundary_trampoline(boundary_trampoline const&) = delete;
+    boundary_trampoline& operator=(boundary_trampoline const&) = delete;
 
-    dispatch_trampoline(dispatch_trampoline&& o) noexcept
+    boundary_trampoline(boundary_trampoline&& o) noexcept
         : h_(std::exchange(o.h_, nullptr)) {}
 
-    dispatch_trampoline& operator=(dispatch_trampoline&& o) noexcept
+    boundary_trampoline& operator=(boundary_trampoline&& o) noexcept
     {
         if(this != &o)
         {
@@ -137,21 +136,15 @@ struct dispatch_trampoline
     }
 
 private:
-    explicit dispatch_trampoline(std::coroutine_handle<promise_type> h) noexcept
+    explicit boundary_trampoline(std::coroutine_handle<promise_type> h) noexcept
         : h_(h) {}
 };
 
-inline dispatch_trampoline make_dispatch_trampoline()
+inline boundary_trampoline make_boundary_trampoline()
 {
     co_return;
 }
 
-//----------------------------------------------------------
-//
-// run_awaitable_ex - with executor (executor switch)
-//
-//----------------------------------------------------------
-
 /** Awaitable that binds an IoRunnable to a specific executor.
 
     Stores the executor and inner task by value. When co_awaited, the
@@ -178,7 +171,8 @@ struct [[nodiscard]] run_awaitable_ex
     frame_memory_resource<Alloc> resource_;
     std::conditional_t<InheritStopToken, std::monostate, std::stop_token> st_;
     io_env env_;
-    dispatch_trampoline tr_;
+    boundary_trampoline tr_;
+    continuation task_cont_;
     Task inner_;  // Last: destroyed first, while env_ is still valid
 
     // void allocator, inherit stop token
@@ -231,9 +225,9 @@ struct [[nodiscard]] run_awaitable_ex
 
     std::coroutine_handle<> await_suspend(std::coroutine_handle<> cont, io_env const* caller_env)
     {
-        tr_ = make_dispatch_trampoline();
+        tr_ = make_boundary_trampoline();
         tr_.h_.promise().caller_ex_ = caller_env->executor;
-        tr_.h_.promise().parent_ = cont;
+        tr_.h_.promise().parent_.h = cont;
 
         auto h = inner_.handle();
         auto& p = h.promise();
@@ -251,7 +245,9 @@ struct [[nodiscard]] run_awaitable_ex
             env_.frame_allocator = caller_env->frame_allocator;
 
         p.set_environment(&env_);
-        return h;
+        task_cont_.h = h;
+        ex_.post(task_cont_);
+        return std::noop_coroutine();
     }
 
     // Non-copyable
@@ -263,12 +259,6 @@ struct [[nodiscard]] run_awaitable_ex
     run_awaitable_ex& operator=(run_awaitable_ex&&) = default;
 };
 
-//----------------------------------------------------------
-//
-// run_awaitable - no executor (inherits caller's executor)
-//
-//----------------------------------------------------------
-
 /** Awaitable that runs a task with optional stop_token override.
 
     Does NOT store an executor - the task inherits the caller's executor
@@ -362,12 +352,6 @@ struct [[nodiscard]] run_awaitable
     run_awaitable& operator=(run_awaitable&&) = default;
 };
 
-//----------------------------------------------------------
-//
-// run_wrapper_ex - with executor
-//
-//----------------------------------------------------------
-
 /** Wrapper returned by run(ex, ...) that accepts a task for execution.
 
     @tparam Ex The executor type.
@@ -503,12 +487,6 @@ class [[nodiscard]] run_wrapper_ex<Ex, InheritStopToken, void>
     }
 };
 
-//----------------------------------------------------------
-//
-// run_wrapper - no executor (inherits caller's executor)
-//
-//----------------------------------------------------------
-
 /** Wrapper returned by run(st) or run(alloc) that accepts a task.
 
     @tparam InheritStopToken If true, inherit caller's stop token.
@@ -627,12 +605,6 @@ class [[nodiscard]] run_wrapper<false, void>
 
 namespace boost::capy {
 
-//----------------------------------------------------------
-//
-// run() overloads - with executor
-//
-//----------------------------------------------------------
-
 /** Bind a task to execute on a specific executor.
 
     Returns a wrapper that accepts a task and produces an awaitable.
@@ -734,12 +706,6 @@ run(Ex ex, std::stop_token st, Alloc alloc)
         std::move(ex), std::move(st), std::move(alloc)};
 }
 
-//----------------------------------------------------------
-//
-// run() overloads - no executor (inherits caller's)
-//
-//----------------------------------------------------------
-
 /** Run a task with a custom stop token.
 
     The task inherits the caller's executor. Only the stop token
diff --git a/include/boost/capy/ex/run_async.hpp b/include/boost/capy/ex/run_async.hpp
index ad86621a0..6f1b061c3 100644
--- a/include/boost/capy/ex/run_async.hpp
+++ b/include/boost/capy/ex/run_async.hpp
@@ -21,6 +21,7 @@
 #include <boost/capy/ex/recycling_memory_resource.hpp>
 #include <boost/capy/ex/work_guard.hpp>
 
+#include <algorithm>
 #include <coroutine>
 #include <cstring>
 #include <memory_resource>
@@ -81,7 +82,7 @@ struct get_promise_awaiter
     @tparam Alloc The allocator type (value type or memory_resource*).
 */
 template<class Ex, class Handlers, class Alloc>
-struct run_async_trampoline
+struct BOOST_CAPY_CORO_DESTROY_WHEN_COMPLETE run_async_trampoline
 {
     using invoke_fn = void(*)(void*, Handlers&);
 
@@ -93,7 +94,11 @@ struct run_async_trampoline
         io_env env_;
         invoke_fn invoke_ = nullptr;
         void* task_promise_ = nullptr;
+        // task_h_: raw handle for frame_guard cleanup in make_trampoline.
+        // task_cont_: continuation wrapping the same handle for executor dispatch.
+        // Both must reference the same coroutine and be kept in sync.
         std::coroutine_handle<> task_h_;
+        continuation task_cont_;
 
         promise_type(Ex& ex, Handlers& h, Alloc& a) noexcept
             : wg_(std::move(ex))
@@ -188,7 +193,8 @@ struct run_async_trampoline
     This avoids double indirection when the user passes a memory_resource*.
 */
 template<class Ex, class Handlers>
-struct run_async_trampoline<Ex, Handlers, std::pmr::memory_resource*>
+struct BOOST_CAPY_CORO_DESTROY_WHEN_COMPLETE
+    run_async_trampoline<Ex, Handlers, std::pmr::memory_resource*>
 {
     using invoke_fn = void(*)(void*, Handlers&);
 
@@ -200,7 +206,11 @@ struct run_async_trampoline<Ex, Handlers, std::pmr::memory_resource*>
         io_env env_;
         invoke_fn invoke_ = nullptr;
         void* task_promise_ = nullptr;
+        // task_h_: raw handle for frame_guard cleanup in make_trampoline.
+        // task_cont_: continuation wrapping the same handle for executor dispatch.
+        // Both must reference the same coroutine and be kept in sync.
         std::coroutine_handle<> task_h_;
+        continuation task_cont_;
 
         promise_type(
             Ex& ex, Handlers& h, std::pmr::memory_resource* mr) noexcept
@@ -282,19 +292,20 @@ make_trampoline(Ex, Handlers, Alloc)
     // promise_type ctor steals the parameters
     auto& p = co_await get_promise_awaiter<
         typename run_async_trampoline<Ex, Handlers, Alloc>::promise_type>{};
-    
+
+    // Guard ensures the task frame is destroyed even when invoke_
+    // throws (e.g. default_handler rethrows an unhandled exception).
+    struct frame_guard
+    {
+        std::coroutine_handle<>& h;
+        ~frame_guard() { h.destroy(); }
+    } guard{p.task_h_};
+
     p.invoke_(p.task_promise_, p.handlers_);
-    p.task_h_.destroy();
 }
 
 } // namespace detail
 
-//----------------------------------------------------------
-//
-// run_async_wrapper
-//
-//----------------------------------------------------------
-
 /** Wrapper returned by run_async that accepts a task for execution.
 
     This wrapper holds the run_async_trampoline coroutine, executor, stop token,
@@ -401,17 +412,14 @@ class [[nodiscard]] run_async_wrapper
         p.env_ = {p.wg_.executor(), st_, p.get_resource()};
         task_promise.set_environment(&p.env_);
 
-        // Start task through executor
-        p.wg_.executor().dispatch(task_h).resume();
+        // Start task through executor.
+        // safe_resume is not needed here: TLS is already saved in the
+        // constructor (saved_tls_) and restored in the destructor.
+        p.task_cont_.h = task_h;
+        p.wg_.executor().dispatch(p.task_cont_).resume();
     }
 };
 
-//----------------------------------------------------------
-//
-// run_async Overloads
-//
-//----------------------------------------------------------
-
 // Executor only (uses default recycling allocator)
 
 /** Asynchronously launch a lazy task on the given executor.
diff --git a/include/boost/capy/ex/strand.hpp b/include/boost/capy/ex/strand.hpp
index 019c2bef8..f607610d3 100644
--- a/include/boost/capy/ex/strand.hpp
+++ b/include/boost/capy/ex/strand.hpp
@@ -11,6 +11,7 @@
 #define BOOST_CAPY_EX_STRAND_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/continuation.hpp>
 #include <coroutine>
 #include <boost/capy/ex/detail/strand_service.hpp>
 
@@ -19,8 +20,6 @@
 namespace boost {
 namespace capy {
 
-//----------------------------------------------------------
-
 /** Provides serialized coroutine execution for any executor type.
 
     A strand wraps an inner executor and ensures that coroutines
@@ -37,18 +36,19 @@ namespace capy {
     Coroutines resumed through a strand shall not run concurrently.
 
     @par Implementation
-    The strand uses a service-based architecture with a fixed pool
-    of 211 implementation objects. New strands hash to select an
-    impl from the pool. Strands that hash to the same index share
-    serialization, which is harmless (just extra serialization)
-    and rare with 211 buckets.
+    Each strand allocates a private serialization state. Strands
+    constructed from the same execution context share a small pool
+    of mutexes (193 entries) selected by hash; mutex sharing causes
+    only brief contention on the push/pop critical section, never
+    cross-strand state sharing. Construction cost: one
+    `std::make_shared` per strand.
 
     @par Executor Concept
     This class satisfies the `Executor` concept, providing:
     - `context()` - Returns the underlying execution context
     - `on_work_started()` / `on_work_finished()` - Work tracking
-    - `dispatch(h)` - May run immediately if strand is idle
-    - `post(h)` - Always queues for later execution
+    - `dispatch(continuation&)` - May run immediately if strand is idle
+    - `post(continuation&)` - Always queues for later execution
 
     @par Thread Safety
     Distinct objects: Safe.
@@ -59,10 +59,13 @@ namespace capy {
     thread_pool pool(4);
     auto strand = make_strand(pool.get_executor());
 
-    // These coroutines will never run concurrently
-    strand.post(coro1);
-    strand.post(coro2);
-    strand.post(coro3);
+    // Continuations are linked intrusively into the strand's queue,
+    // so each one must outlive its time there. Storage is typically
+    // owned by the awaitable or operation state that posted it.
+    continuation c1{h1}, c2{h2}, c3{h3};
+    strand.post(c1);
+    strand.post(c2);
+    strand.post(c3);
     @endcode
 
     @tparam E The type of the underlying executor. Must
@@ -73,9 +76,11 @@ namespace capy {
 template<typename Ex>
 class strand
 {
-    detail::strand_impl* impl_;
+    std::shared_ptr<detail::strand_impl> impl_;
     Ex ex_;
 
+    friend struct strand_test;
+
 public:
     /** The type of the underlying executor.
     */
@@ -83,9 +88,8 @@ class strand
 
     /** Construct a strand for the specified executor.
 
-        Obtains a strand implementation from the service associated
-        with the executor's context. The implementation is selected
-        from a fixed pool using a hash function.
+        Allocates a fresh strand implementation from the service
+        associated with the executor's context.
 
         @param ex The inner executor to wrap. Coroutines will
             ultimately be dispatched through this executor.
@@ -101,12 +105,12 @@ class strand
     explicit
     strand(Ex1&& ex)
         : impl_(detail::get_strand_service(ex.context())
-            .get_implementation())
+            .create_implementation())
         , ex_(std::forward<Ex1>(ex))
     {
     }
 
-    /** Copy constructor.
+    /** Construct a copy.
 
         Creates a strand that shares serialization state with
         the original. Coroutines dispatched through either strand
@@ -114,15 +118,21 @@ class strand
     */
     strand(strand const&) = default;
 
-    /** Move constructor.
+    /** Construct by moving.
+
+        @note A moved-from strand is only safe to destroy
+            or reassign.
     */
     strand(strand&&) = default;
 
-    /** Copy assignment operator.
+    /** Assign by copying.
     */
     strand& operator=(strand const&) = default;
 
-    /** Move assignment operator.
+    /** Assign by moving.
+
+        @note A moved-from strand is only safe to destroy
+            or reassign.
     */
     strand& operator=(strand&&) = default;
 
@@ -192,46 +202,50 @@ class strand
     bool
     operator==(strand const& other) const noexcept
     {
-        return impl_ == other.impl_;
+        return impl_.get() == other.impl_.get();
     }
 
-    /** Post a coroutine to the strand.
+    /** Post a continuation to the strand.
 
-        The coroutine is always queued for execution, never resumed
+        The continuation is always queued for execution, never resumed
         immediately. When the strand becomes available, queued
-        coroutines execute in FIFO order on the underlying executor.
+        work executes in FIFO order on the underlying executor.
 
         @par Ordering
         Guarantees strict FIFO ordering relative to other post() calls.
         Use this instead of dispatch() when ordering matters.
 
-        @param h The coroutine handle to post.
+        @param c The continuation to post. The caller retains
+            ownership; the continuation must remain valid until
+            it is dequeued and resumed.
     */
     void
-    post(std::coroutine_handle<> h) const
+    post(continuation& c) const
     {
-        detail::strand_service::post(*impl_, executor_ref(ex_), h);
+        detail::strand_service::post(impl_, executor_ref(ex_), c);
     }
 
-    /** Dispatch a coroutine through the strand.
+    /** Dispatch a continuation through the strand.
 
         Returns a handle for symmetric transfer. If the calling
-        thread is already executing within this strand, returns `h`.
-        Otherwise, the coroutine is queued and
+        thread is already executing within this strand, returns `c.h`.
+        Otherwise, the continuation is queued and
         `std::noop_coroutine()` is returned.
 
         @par Ordering
         Callers requiring strict FIFO ordering should use post()
-        instead, which always queues the coroutine.
+        instead, which always queues the continuation.
 
-        @param h The coroutine handle to dispatch.
+        @param c The continuation to dispatch. The caller retains
+            ownership; the continuation must remain valid until
+            it is dequeued and resumed.
 
         @return A handle for symmetric transfer or `std::noop_coroutine()`.
     */
     std::coroutine_handle<>
-    dispatch(std::coroutine_handle<> h) const
+    dispatch(continuation& c) const
     {
-        return detail::strand_service::dispatch(*impl_, executor_ref(ex_), h);
+        return detail::strand_service::dispatch(impl_, executor_ref(ex_), c);
     }
 };
 
diff --git a/include/boost/capy/ex/thread_pool.hpp b/include/boost/capy/ex/thread_pool.hpp
index e0126cc07..29097ec60 100644
--- a/include/boost/capy/ex/thread_pool.hpp
+++ b/include/boost/capy/ex/thread_pool.hpp
@@ -5,13 +5,14 @@
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 //
-// Official repository: https://github.com/boostorg/capy
+// Official repository: https://github.com/cppalliance/capy
 //
 
 #ifndef BOOST_CAPY_EX_THREAD_POOL_HPP
 #define BOOST_CAPY_EX_THREAD_POOL_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/continuation.hpp>
 #include <coroutine>
 #include <boost/capy/ex/execution_context.hpp>
 #include <cstddef>
@@ -78,11 +79,47 @@ class BOOST_CAPY_DECL
     thread_pool(thread_pool const&) = delete;
     thread_pool& operator=(thread_pool const&) = delete;
 
+    /** Wait for all outstanding work to complete.
+
+        Releases the internal work guard, then blocks the calling
+        thread until all outstanding work tracked by
+        @ref executor_type::on_work_started and
+        @ref executor_type::on_work_finished completes. After all
+        work finishes, joins the worker threads.
+
+        If @ref stop is called while `join()` is blocking, the
+        pool stops without waiting for remaining work to
+        complete. Worker threads finish their current item and
+        exit; `join()` still waits for all threads to be joined
+        before returning.
+
+        This function is idempotent. The first call performs the
+        join; subsequent calls return immediately.
+
+        @par Preconditions
+        Must not be called from a thread in this pool (undefined
+        behavior).
+
+        @par Postconditions
+        All worker threads have been joined. The pool cannot be
+        reused.
+
+        @par Thread Safety
+        May be called from any thread not in this pool.
+    */
+    void
+    join() noexcept;
+
     /** Request all worker threads to stop.
 
-        Signals all threads to exit. Threads will finish their
-        current work item before exiting. Does not wait for
-        threads to exit.
+        Signals all threads to exit after finishing their current
+        work item. Queued work that has not started is abandoned.
+        Does not wait for threads to exit.
+
+        If @ref join is blocking on another thread, calling
+        `stop()` causes it to stop waiting for outstanding
+        work. The `join()` call still waits for worker threads
+        to finish their current item and exit before returning.
     */
     void
     stop() noexcept;
@@ -95,8 +132,6 @@ class BOOST_CAPY_DECL
     get_executor() const noexcept;
 };
 
-//------------------------------------------------------------------------------
-
 /** An executor that submits work to a thread_pool.
 
     Executors are lightweight handles that can be copied and stored.
@@ -119,7 +154,12 @@ class thread_pool::executor_type
     }
 
 public:
-    /// Default construct a null executor.
+    /** Construct a default null executor.
+
+        The resulting executor is not associated with any pool.
+        `context()`, `dispatch()`, and `post()` require the
+        executor to be associated with a pool before use.
+    */
     executor_type() = default;
 
     /// Return the underlying thread pool.
@@ -129,46 +169,61 @@ class thread_pool::executor_type
         return *pool_;
     }
 
-    /// Notify that work has started (no-op for thread pools).
+    /** Notify that work has started.
+
+        Increments the outstanding work count. Must be paired
+        with a subsequent call to @ref on_work_finished.
+
+        @see on_work_finished, work_guard
+    */
+    BOOST_CAPY_DECL
     void
-    on_work_started() const noexcept
-    {
-    }
+    on_work_started() const noexcept;
+
+    /** Notify that work has finished.
+
+        Decrements the outstanding work count. When the count
+        reaches zero after @ref thread_pool::join has been called,
+        the pool's worker threads are signaled to stop.
 
-    /// Notify that work has finished (no-op for thread pools).
+        @pre A preceding call to @ref on_work_started was made.
+
+        @see on_work_started, work_guard
+    */
+    BOOST_CAPY_DECL
     void
-    on_work_finished() const noexcept
-    {
-    }
+    on_work_finished() const noexcept;
 
-    /** Dispatch a coroutine for execution.
+    /** Dispatch a continuation for execution.
 
-        Posts the coroutine to the thread pool for execution on a
-        worker thread and returns `std::noop_coroutine()`. Thread
-        pools never execute inline because no single thread "owns"
-        the pool.
+        If the calling thread is a worker of this pool, returns
+        `c.h` for symmetric transfer so the caller can resume the
+        continuation inline. Otherwise, posts the continuation to
+        the pool for execution on a worker thread and returns
+        `std::noop_coroutine()`.
 
-        @param h The coroutine handle to execute.
+        @param c The continuation to execute. On the post path,
+                 must remain at a stable address until dequeued
+                 and resumed.
 
-        @return `std::noop_coroutine()` always.
+        @return `c.h` when the calling thread is a pool worker;
+                `std::noop_coroutine()` otherwise.
     */
+    BOOST_CAPY_DECL
     std::coroutine_handle<>
-    dispatch(std::coroutine_handle<> h) const
-    {
-        post(h);
-        return std::noop_coroutine();
-    }
+    dispatch(continuation& c) const;
 
-    /** Post a coroutine to the thread pool.
+    /** Post a continuation to the thread pool.
 
-        The coroutine will be resumed on one of the pool's
-        worker threads.
+        The continuation will be resumed on one of the pool's
+        worker threads. The continuation must remain at a stable
+        address until it is dequeued and resumed.
 
-        @param h The coroutine handle to execute.
+        @param c The continuation to execute.
     */
     BOOST_CAPY_DECL
     void
-    post(std::coroutine_handle<> h) const;
+    post(continuation& c) const;
 
     /// Return true if two executors refer to the same thread pool.
     bool
@@ -178,17 +233,6 @@ class thread_pool::executor_type
     }
 };
 
-//------------------------------------------------------------------------------
-
-inline
-auto
-thread_pool::
-get_executor() const noexcept ->
-    executor_type
-{
-    return executor_type(const_cast<thread_pool&>(*this));
-}
-
 } // capy
 } // boost
 
diff --git a/include/boost/capy/ex/work_guard.hpp b/include/boost/capy/ex/work_guard.hpp
index 67b3078d9..5a8b50939 100644
--- a/include/boost/capy/ex/work_guard.hpp
+++ b/include/boost/capy/ex/work_guard.hpp
@@ -101,7 +101,7 @@ class work_guard
         ex_.on_work_started();
     }
 
-    /** Copy constructor.
+    /** Construct a copy.
 
         Creates a new work guard holding work on the same executor.
         Calls `on_work_started()` on the executor.
@@ -123,7 +123,7 @@ class work_guard
             ex_.on_work_started();
     }
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers work ownership from `other` to `*this`. Does not
         call `on_work_started()` or `on_work_finished()`.
@@ -213,8 +213,6 @@ class work_guard
     }
 };
 
-//------------------------------------------------
-
 /** Create a work guard from an executor.
 
     @par Exception Safety
diff --git a/include/boost/capy/io/any_buffer_sink.hpp b/include/boost/capy/io/any_buffer_sink.hpp
index 5b120c8c6..dc92cf4fe 100644
--- a/include/boost/capy/io/any_buffer_sink.hpp
+++ b/include/boost/capy/io/any_buffer_sink.hpp
@@ -136,7 +136,7 @@ class any_buffer_sink
     */
     ~any_buffer_sink();
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty wrapper. Operations on a default-constructed
         wrapper result in undefined behavior.
@@ -150,7 +150,7 @@ class any_buffer_sink
     any_buffer_sink(any_buffer_sink const&) = delete;
     any_buffer_sink& operator=(any_buffer_sink const&) = delete;
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers ownership of the wrapped sink (if owned) and
         cached awaitable storage from `other`. After the move, `other` is
@@ -168,7 +168,7 @@ class any_buffer_sink
     {
     }
 
-    /** Move assignment operator.
+    /** Assign by moving.
 
         Destroys any owned sink and releases existing resources,
         then transfers ownership from `other`.
@@ -250,7 +250,7 @@ class any_buffer_sink
 
         @param n The number of bytes to commit.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
 
         @par Preconditions
         The wrapper must contain a valid sink (`has_value() == true`).
@@ -266,7 +266,7 @@ class any_buffer_sink
 
         @param n The number of bytes to commit.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
 
         @par Preconditions
         The wrapper must contain a valid sink (`has_value() == true`).
@@ -276,8 +276,9 @@ class any_buffer_sink
 
     /** Write some data from a buffer sequence.
 
-        Writes one or more bytes from the buffer sequence to the
-        underlying sink. May consume less than the full sequence.
+        Attempt to write up to `buffer_size( buffers )` bytes from
+        the buffer sequence to the underlying sink. May consume less
+        than the full sequence.
 
         When the wrapped type provides native @ref WriteSink support,
         the operation forwards directly. Otherwise it is synthesized
@@ -285,7 +286,7 @@ class any_buffer_sink
 
         @param buffers The buffer sequence to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Preconditions
         The wrapper must contain a valid sink (`has_value() == true`).
@@ -305,7 +306,7 @@ class any_buffer_sink
 
         @param buffers The buffer sequence to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Preconditions
         The wrapper must contain a valid sink (`has_value() == true`).
@@ -326,7 +327,7 @@ class any_buffer_sink
 
         @param buffers The buffer sequence to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Preconditions
         The wrapper must contain a valid sink (`has_value() == true`).
@@ -344,7 +345,7 @@ class any_buffer_sink
         the underlying `write_eof()` is called. Otherwise the
         operation is implemented as `commit_eof(0)`.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
 
         @par Preconditions
         The wrapper must contain a valid sink (`has_value() == true`).
@@ -400,9 +401,7 @@ class any_buffer_sink
     write_eof_buffers_(std::span<const_buffer const> buffers);
 };
 
-//----------------------------------------------------------
-
-/** Type-erased ops for awaitables yielding `io_result<>`. */
+/** Type-erased ops for awaitables that await-return `io_result<>`. */
 struct any_buffer_sink::awaitable_ops
 {
     bool (*await_ready)(void*);
@@ -411,7 +410,7 @@ struct any_buffer_sink::awaitable_ops
     void (*destroy)(void*) noexcept;
 };
 
-/** Type-erased ops for awaitables yielding `io_result<std::size_t>`. */
+/** Type-erased ops for awaitables that await-return `io_result<std::size_t>`. */
 struct any_buffer_sink::write_awaitable_ops
 {
     bool (*await_ready)(void*);
@@ -532,9 +531,6 @@ struct any_buffer_sink::vtable_for_impl
         return &ops;
     }
 
-    //------------------------------------------------------
-    // WriteSink forwarding (only instantiated when WriteSink<S>)
-
     static write_awaitable_ops const*
     construct_write_some_awaitable_impl(
         void* sink,
@@ -653,8 +649,6 @@ struct any_buffer_sink::vtable_for_impl
         return &ops;
     }
 
-    //------------------------------------------------------
-
     static consteval std::size_t
     compute_max_size() noexcept
     {
@@ -735,8 +729,6 @@ struct any_buffer_sink::vtable_for_impl
     static constexpr vtable value = make_vtable();
 };
 
-//----------------------------------------------------------
-
 inline
 any_buffer_sink::~any_buffer_sink()
 {
@@ -805,8 +797,6 @@ any_buffer_sink::any_buffer_sink(S* s)
     cached_awaitable_ = ::operator new(vt_->awaitable_size);
 }
 
-//----------------------------------------------------------
-
 inline std::span<mutable_buffer>
 any_buffer_sink::prepare(std::span<mutable_buffer> dest)
 {
@@ -897,9 +887,6 @@ any_buffer_sink::commit_eof(std::size_t n)
     return awaitable{this, n};
 }
 
-//----------------------------------------------------------
-// Private helpers for native WriteSink forwarding
-
 inline auto
 any_buffer_sink::write_some_(
     std::span<const_buffer const> buffers)
@@ -1050,9 +1037,6 @@ any_buffer_sink::write_eof_buffers_(
     return awaitable{this, buffers};
 }
 
-//----------------------------------------------------------
-// Public WriteSink methods
-
 template<ConstBufferSequence CB>
 io_task<std::size_t>
 any_buffer_sink::write_some(CB buffers)
@@ -1264,8 +1248,6 @@ any_buffer_sink::write_eof(CB buffers)
     co_return {{}, total};
 }
 
-//----------------------------------------------------------
-
 static_assert(BufferSink<any_buffer_sink>);
 static_assert(WriteSink<any_buffer_sink>);
 
diff --git a/include/boost/capy/io/any_buffer_source.hpp b/include/boost/capy/io/any_buffer_source.hpp
index 387aba7ed..bdce6e542 100644
--- a/include/boost/capy/io/any_buffer_source.hpp
+++ b/include/boost/capy/io/any_buffer_source.hpp
@@ -15,7 +15,6 @@
 #include <boost/capy/buffers.hpp>
 #include <boost/capy/buffers/buffer_copy.hpp>
 #include <boost/capy/buffers/buffer_param.hpp>
-#include <boost/capy/buffers/slice.hpp>
 #include <boost/capy/concept/buffer_source.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <boost/capy/concept/read_source.hpp>
@@ -120,7 +119,7 @@ class any_buffer_source
     */
     ~any_buffer_source();
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty wrapper. Operations on a default-constructed
         wrapper result in undefined behavior.
@@ -134,7 +133,7 @@ class any_buffer_source
     any_buffer_source(any_buffer_source const&) = delete;
     any_buffer_source& operator=(any_buffer_source const&) = delete;
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers ownership of the wrapped source (if owned) and
         cached awaitable storage from `other`. After the move, `other` is
@@ -152,7 +151,7 @@ class any_buffer_source
     {
     }
 
-    /** Move assignment operator.
+    /** Assign by moving.
 
         Destroys any owned source and releases existing resources,
         then transfers ownership from `other`.
@@ -233,7 +232,7 @@ class any_buffer_source
 
         @param dest Span of const_buffer to fill.
 
-        @return An awaitable yielding `(error_code,std::span<const_buffer>)`.
+        @return An awaitable that await-returns `(error_code,std::span<const_buffer>)`.
             On success with data, a non-empty span of filled buffers.
             On EOF, `ec == cond::eof` and span is empty.
 
@@ -247,8 +246,8 @@ class any_buffer_source
 
     /** Read some data into a mutable buffer sequence.
 
-        Reads one or more bytes into the caller's buffers. May fill
-        less than the full sequence.
+        Attempt to read up to `buffer_size( buffers )` bytes into
+        the caller's buffers. May fill less than the full sequence.
 
         When the wrapped type provides native @ref ReadSource support,
         the operation forwards directly. Otherwise it is synthesized
@@ -256,7 +255,7 @@ class any_buffer_source
 
         @param buffers The buffer sequence to fill.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Preconditions
         The wrapper must contain a valid source (`has_value() == true`).
@@ -278,7 +277,7 @@ class any_buffer_source
 
         @param buffers The buffer sequence to fill.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
             On success, `n == buffer_size(buffers)`.
             On EOF, `ec == error::eof` and `n` is bytes transferred.
 
@@ -333,9 +332,7 @@ class any_buffer_source
     read_(std::span<mutable_buffer const> buffers);
 };
 
-//----------------------------------------------------------
-
-/** Type-erased ops for awaitables yielding `io_result<std::span<const_buffer>>`. */
+/** Type-erased ops for awaitables that await-return `io_result<std::span<const_buffer>>`. */
 struct any_buffer_source::awaitable_ops
 {
     bool (*await_ready)(void*);
@@ -344,7 +341,7 @@ struct any_buffer_source::awaitable_ops
     void (*destroy)(void*) noexcept;
 };
 
-/** Type-erased ops for awaitables yielding `io_result<std::size_t>`. */
+/** Type-erased ops for awaitables that await-return `io_result<std::size_t>`. */
 struct any_buffer_source::read_awaitable_ops
 {
     bool (*await_ready)(void*);
@@ -421,9 +418,6 @@ struct any_buffer_source::vtable_for_impl
         return &ops;
     }
 
-    //------------------------------------------------------
-    // ReadSource forwarding (only instantiated when ReadSource<S>)
-
     static read_awaitable_ops const*
     construct_read_some_awaitable_impl(
         void* source,
@@ -484,8 +478,6 @@ struct any_buffer_source::vtable_for_impl
         return &ops;
     }
 
-    //------------------------------------------------------
-
     static consteval std::size_t
     compute_max_size() noexcept
     {
@@ -545,8 +537,6 @@ struct any_buffer_source::vtable_for_impl
     static constexpr vtable value = make_vtable();
 };
 
-//----------------------------------------------------------
-
 inline
 any_buffer_source::~any_buffer_source()
 {
@@ -615,8 +605,6 @@ any_buffer_source::any_buffer_source(S* s)
     cached_awaitable_ = ::operator new(vt_->awaitable_size);
 }
 
-//----------------------------------------------------------
-
 inline void
 any_buffer_source::consume(std::size_t n) noexcept
 {
@@ -665,9 +653,6 @@ any_buffer_source::pull(std::span<const_buffer> dest)
     return awaitable{this, dest};
 }
 
-//----------------------------------------------------------
-// Private helpers for native ReadSource forwarding
-
 inline auto
 any_buffer_source::read_some_(
     std::span<mutable_buffer const> buffers)
@@ -768,9 +753,6 @@ any_buffer_source::read_(
     return awaitable{this, buffers};
 }
 
-//----------------------------------------------------------
-// Public ReadSource methods
-
 template<MutableBufferSequence MB>
 io_task<std::size_t>
 any_buffer_source::read_some(MB buffers)
@@ -842,8 +824,6 @@ any_buffer_source::read(MB buffers)
     co_return {{}, total};
 }
 
-//----------------------------------------------------------
-
 static_assert(BufferSource<any_buffer_source>);
 static_assert(ReadSource<any_buffer_source>);
 
diff --git a/include/boost/capy/io/any_read_source.hpp b/include/boost/capy/io/any_read_source.hpp
index e8ea09059..5f441aa54 100644
--- a/include/boost/capy/io/any_read_source.hpp
+++ b/include/boost/capy/io/any_read_source.hpp
@@ -13,7 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/await_suspend_helper.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/buffer_array.hpp>
+#include <boost/capy/detail/buffer_array.hpp>
 #include <boost/capy/buffers/buffer_param.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <boost/capy/concept/read_source.hpp>
@@ -24,6 +24,7 @@
 #include <concepts>
 #include <coroutine>
 #include <cstddef>
+#include <exception>
 #include <new>
 #include <span>
 #include <stop_token>
@@ -98,7 +99,7 @@ class any_read_source
     */
     ~any_read_source();
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty wrapper. Operations on a default-constructed
         wrapper result in undefined behavior.
@@ -112,7 +113,7 @@ class any_read_source
     any_read_source(any_read_source const&) = delete;
     any_read_source& operator=(any_read_source const&) = delete;
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers ownership of the wrapped source (if owned) and
         cached awaitable storage from `other`. After the move, `other` is
@@ -129,7 +130,7 @@ class any_read_source
     {
     }
 
-    /** Move assignment operator.
+    /** Assign by moving.
 
         Destroys any owned source and releases existing resources,
         then transfers ownership from `other`.
@@ -185,12 +186,13 @@ class any_read_source
 
     /** Initiate a partial read operation.
 
-        Reads one or more bytes into the provided buffer sequence.
-        May fill less than the full sequence.
+        Attempt to read up to `buffer_size( buffers )` bytes into
+        the provided buffer sequence. May fill less than the
+        full sequence.
 
         @param buffers The buffer sequence to read into.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -223,7 +225,7 @@ class any_read_source
 
         @param buffers The buffer sequence to read into.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -275,8 +277,6 @@ class any_read_source
     read_(std::span<mutable_buffer const> buffers);
 };
 
-//----------------------------------------------------------
-
 // ordered by call sequence for cache line coherence
 struct any_read_source::awaitable_ops
 {
@@ -388,8 +388,6 @@ struct any_read_source::vtable_for_impl
     };
 };
 
-//----------------------------------------------------------
-
 inline
 any_read_source::~any_read_source()
 {
@@ -467,8 +465,6 @@ any_read_source::any_read_source(S* s)
     cached_awaitable_ = ::operator new(vt_->awaitable_size);
 }
 
-//----------------------------------------------------------
-
 template<MutableBufferSequence MB>
 auto
 any_read_source::read_some(MB buffers)
@@ -476,7 +472,7 @@ any_read_source::read_some(MB buffers)
     struct awaitable
     {
         any_read_source* self_;
-        mutable_buffer_array<detail::max_iovec_> ba_;
+        detail::mutable_buffer_array<detail::max_iovec_> ba_;
 
         awaitable(any_read_source* self, MB const& buffers)
             : self_(self)
diff --git a/include/boost/capy/io/any_read_stream.hpp b/include/boost/capy/io/any_read_stream.hpp
index dc5d3598d..ddbcc3b23 100644
--- a/include/boost/capy/io/any_read_stream.hpp
+++ b/include/boost/capy/io/any_read_stream.hpp
@@ -13,7 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/await_suspend_helper.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/buffer_array.hpp>
+#include <boost/capy/detail/buffer_array.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <boost/capy/concept/read_stream.hpp>
 #include <boost/capy/ex/io_env.hpp>
@@ -22,6 +22,7 @@
 #include <concepts>
 #include <coroutine>
 #include <cstddef>
+#include <exception>
 #include <new>
 #include <span>
 #include <stop_token>
@@ -96,7 +97,7 @@ class any_read_stream
     */
     ~any_read_stream();
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty wrapper. Operations on a default-constructed
         wrapper result in undefined behavior.
@@ -110,7 +111,7 @@ class any_read_stream
     any_read_stream(any_read_stream const&) = delete;
     any_read_stream& operator=(any_read_stream const&) = delete;
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers ownership of the wrapped stream (if owned) and
         cached awaitable storage from `other`. After the move, `other` is
@@ -127,7 +128,7 @@ class any_read_stream
     {
     }
 
-    /** Move assignment operator.
+    /** Assign by moving.
 
         Destroys any owned stream and releases existing resources,
         then transfers ownership from `other`.
@@ -191,7 +192,7 @@ class any_read_stream
             value to ensure the sequence lives in the coroutine frame
             across suspension points.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -234,8 +235,6 @@ class any_read_stream
     }
 };
 
-//----------------------------------------------------------
-
 struct any_read_stream::vtable
 {
     // ordered by call frequency for cache line coherence
@@ -295,8 +294,6 @@ struct any_read_stream::vtable_for_impl
     };
 };
 
-//----------------------------------------------------------
-
 inline
 any_read_stream::~any_read_stream()
 {
@@ -374,8 +371,6 @@ any_read_stream::any_read_stream(S* s)
     cached_awaitable_ = ::operator new(vt_->awaitable_size);
 }
 
-//----------------------------------------------------------
-
 template<MutableBufferSequence MB>
 auto
 any_read_stream::read_some(MB buffers)
@@ -385,7 +380,7 @@ any_read_stream::read_some(MB buffers)
     struct awaitable
     {
         any_read_stream* self_;
-        mutable_buffer_array<detail::max_iovec_> ba_;
+        detail::mutable_buffer_array<detail::max_iovec_> ba_;
 
         bool
         await_ready()
@@ -422,7 +417,7 @@ any_read_stream::read_some(MB buffers)
         }
     };
     return awaitable{this,
-        mutable_buffer_array<detail::max_iovec_>(buffers)};
+        detail::mutable_buffer_array<detail::max_iovec_>(buffers)};
 }
 
 } // namespace capy
diff --git a/include/boost/capy/io/any_stream.hpp b/include/boost/capy/io/any_stream.hpp
index 3d135bdca..e8bf6a0d6 100644
--- a/include/boost/capy/io/any_stream.hpp
+++ b/include/boost/capy/io/any_stream.hpp
@@ -99,7 +99,7 @@ class any_stream
         }
     }
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty wrapper. Operations on a default-constructed
         wrapper result in undefined behavior.
@@ -113,7 +113,7 @@ class any_stream
     any_stream(any_stream const&) = delete;
     any_stream& operator=(any_stream const&) = delete;
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers ownership from both bases and the owned stream (if any).
 
@@ -128,7 +128,7 @@ class any_stream
     {
     }
 
-    /** Move assignment operator.
+    /** Assign by moving.
 
         Destroys any owned stream and releases existing resources,
         then transfers ownership from `other`.
diff --git a/include/boost/capy/io/any_write_sink.hpp b/include/boost/capy/io/any_write_sink.hpp
index 77d3b7b37..1cdfd43dd 100644
--- a/include/boost/capy/io/any_write_sink.hpp
+++ b/include/boost/capy/io/any_write_sink.hpp
@@ -13,7 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/await_suspend_helper.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/buffer_array.hpp>
+#include <boost/capy/detail/buffer_array.hpp>
 #include <boost/capy/buffers/buffer_param.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <boost/capy/concept/write_sink.hpp>
@@ -103,7 +103,7 @@ class any_write_sink
     */
     ~any_write_sink();
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty wrapper. Operations on a default-constructed
         wrapper result in undefined behavior.
@@ -117,7 +117,7 @@ class any_write_sink
     any_write_sink(any_write_sink const&) = delete;
     any_write_sink& operator=(any_write_sink const&) = delete;
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers ownership of the wrapped sink (if owned) and
         cached awaitable storage from `other`. After the move, `other` is
@@ -135,7 +135,7 @@ class any_write_sink
     {
     }
 
-    /** Move assignment operator.
+    /** Assign by moving.
 
         Destroys any owned sink and releases existing resources,
         then transfers ownership from `other`.
@@ -191,12 +191,13 @@ class any_write_sink
 
     /** Initiate a partial write operation.
 
-        Writes one or more bytes from the provided buffer sequence.
-        May consume less than the full sequence.
+        Attempt to write up to `buffer_size( buffers )` bytes from
+        the provided buffer sequence. May consume less than the
+        full sequence.
 
         @param buffers The buffer sequence containing data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -226,7 +227,7 @@ class any_write_sink
 
         @param buffers The buffer sequence containing data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -256,7 +257,7 @@ class any_write_sink
 
         @param buffers The buffer sequence containing data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -279,7 +280,7 @@ class any_write_sink
         The operation completes when the sink is finalized, or
         an error occurs.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -325,8 +326,6 @@ class any_write_sink
     write_eof_buffers_(std::span<const_buffer const> buffers);
 };
 
-//----------------------------------------------------------
-
 struct any_write_sink::write_awaitable_ops
 {
     bool (*await_ready)(void*);
@@ -521,8 +520,6 @@ struct any_write_sink::vtable_for_impl
     };
 };
 
-//----------------------------------------------------------
-
 inline
 any_write_sink::~any_write_sink()
 {
@@ -605,8 +602,6 @@ any_write_sink::any_write_sink(S* s)
     cached_awaitable_ = ::operator new(vt_->awaitable_size);
 }
 
-//----------------------------------------------------------
-
 inline auto
 any_write_sink::write_some_(
     std::span<const_buffer const> buffers)
@@ -803,7 +798,7 @@ any_write_sink::write_some(CB buffers)
     struct awaitable
     {
         any_write_sink* self_;
-        const_buffer_array<detail::max_iovec_> ba_;
+        detail::const_buffer_array<detail::max_iovec_> ba_;
 
         awaitable(
             any_write_sink* self,
diff --git a/include/boost/capy/io/any_write_stream.hpp b/include/boost/capy/io/any_write_stream.hpp
index a7d871abb..40088127b 100644
--- a/include/boost/capy/io/any_write_stream.hpp
+++ b/include/boost/capy/io/any_write_stream.hpp
@@ -13,7 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/await_suspend_helper.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/buffer_array.hpp>
+#include <boost/capy/detail/buffer_array.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <boost/capy/concept/write_stream.hpp>
 #include <coroutine>
@@ -23,6 +23,7 @@
 #include <concepts>
 #include <coroutine>
 #include <cstddef>
+#include <exception>
 #include <new>
 #include <span>
 #include <stop_token>
@@ -97,7 +98,7 @@ class any_write_stream
     */
     ~any_write_stream();
 
-    /** Default constructor.
+    /** Construct a default instance.
 
         Constructs an empty wrapper. Operations on a default-constructed
         wrapper result in undefined behavior.
@@ -111,7 +112,7 @@ class any_write_stream
     any_write_stream(any_write_stream const&) = delete;
     any_write_stream& operator=(any_write_stream const&) = delete;
 
-    /** Move constructor.
+    /** Construct by moving.
 
         Transfers ownership of the wrapped stream (if owned) and
         cached awaitable storage from `other`. After the move, `other` is
@@ -128,7 +129,7 @@ class any_write_stream
     {
     }
 
-    /** Move assignment operator.
+    /** Assign by moving.
 
         Destroys any owned stream and releases existing resources,
         then transfers ownership from `other`.
@@ -192,7 +193,7 @@ class any_write_stream
             Passed by value to ensure the sequence lives in the
             coroutine frame across suspension points.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @par Immediate Completion
         The operation completes immediately without suspending
@@ -235,8 +236,6 @@ class any_write_stream
     }
 };
 
-//----------------------------------------------------------
-
 struct any_write_stream::vtable
 {
     // ordered by call frequency for cache line coherence
@@ -296,8 +295,6 @@ struct any_write_stream::vtable_for_impl
     };
 };
 
-//----------------------------------------------------------
-
 inline
 any_write_stream::~any_write_stream()
 {
@@ -375,8 +372,6 @@ any_write_stream::any_write_stream(S* s)
     cached_awaitable_ = ::operator new(vt_->awaitable_size);
 }
 
-//----------------------------------------------------------
-
 template<ConstBufferSequence CB>
 auto
 any_write_stream::write_some(CB buffers)
@@ -384,7 +379,7 @@ any_write_stream::write_some(CB buffers)
     struct awaitable
     {
         any_write_stream* self_;
-        const_buffer_array<detail::max_iovec_> ba_;
+        detail::const_buffer_array<detail::max_iovec_> ba_;
 
         awaitable(
             any_write_stream* self,
diff --git a/include/boost/capy/io/pull_from.hpp b/include/boost/capy/io/pull_from.hpp
index 197161b77..e639add0c 100644
--- a/include/boost/capy/io/pull_from.hpp
+++ b/include/boost/capy/io/pull_from.hpp
@@ -81,13 +81,11 @@ pull_from(Src& source, Sink& sink)
         auto [ec, n] = co_await source.read(
             std::span<mutable_buffer const>(dst_bufs));
 
-        if(n > 0)
-        {
-            auto [commit_ec] = co_await sink.commit(n);
-            if(commit_ec)
-                co_return {commit_ec, total};
-            total += n;
-        }
+        auto [commit_ec] = co_await sink.commit(n);
+        total += n;
+
+        if(commit_ec)
+            co_return {commit_ec, total};
 
         if(ec == cond::eof)
         {
@@ -165,16 +163,12 @@ pull_from(Src& source, Sink& sink)
         auto [ec, n] = co_await source.read_some(
             std::span<mutable_buffer const>(dst_bufs));
 
-        // Commit any data that was read
-        if(n > 0)
-        {
-            auto [commit_ec] = co_await sink.commit(n);
-            if(commit_ec)
-                co_return {commit_ec, total};
-            total += n;
-        }
+        auto [commit_ec] = co_await sink.commit(n);
+        total += n;
+
+        if(commit_ec)
+            co_return {commit_ec, total};
 
-        // Check for EOF condition
         if(ec == cond::eof)
         {
             auto [eof_ec] = co_await sink.commit_eof(0);
diff --git a/include/boost/capy/io/push_to.hpp b/include/boost/capy/io/push_to.hpp
index cbff60950..d772d4b4c 100644
--- a/include/boost/capy/io/push_to.hpp
+++ b/include/boost/capy/io/push_to.hpp
@@ -139,11 +139,10 @@ push_to(Src& source, Stream& stream)
             co_return {ec, total};
 
         auto [write_ec, n] = co_await stream.write_some(bufs);
-        if(write_ec)
-            co_return {write_ec, total};
-
         total += n;
         source.consume(n);
+        if(write_ec)
+            co_return {write_ec, total};
     }
 }
 
diff --git a/include/boost/capy/io/write_now.hpp b/include/boost/capy/io/write_now.hpp
index 4cfe6f8b3..9e5a8cc0b 100644
--- a/include/boost/capy/io/write_now.hpp
+++ b/include/boost/capy/io/write_now.hpp
@@ -13,7 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/detail/await_suspend_helper.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/consuming_buffers.hpp>
+#include <boost/capy/buffers/buffer_slice.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <boost/capy/concept/write_stream.hpp>
 #include <coroutine>
@@ -302,7 +302,7 @@ class write_now
             value to ensure the sequence lives in the coroutine
             frame across suspension points.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
             On success, `n` equals `buffer_size(buffers)`. On
             error, `n` is the number of bytes written before the
             error. Compare error codes to conditions:
@@ -336,16 +336,16 @@ class write_now
     {
         std::size_t const total_size = buffer_size(buffers);
         std::size_t total_written = 0;
-        consuming_buffers cb(buffers);
+        auto cb = buffer_slice(buffers);
         while(total_written < total_size)
         {
             auto r =
-                co_await stream_.write_some(cb);
+                co_await stream_.write_some(cb.data());
+            cb.remove_prefix(std::get<0>(r.values));
+            total_written += std::get<0>(r.values);
             if(r.ec)
                 co_return io_result<std::size_t>{
                     r.ec, total_written};
-            cb.consume(r.t1);
-            total_written += r.t1;
         }
         co_return io_result<std::size_t>{
             {}, total_written};
@@ -359,20 +359,20 @@ class write_now
         std::size_t total_written = 0;
 
         // GCC ICE in expand_expr_real_1 (expr.cc:11376)
-        // when consuming_buffers spans a co_yield, so
+        // when the buffer slice spans a co_yield, so
         // the GCC path uses a separate simple coroutine.
-        consuming_buffers cb(buffers);
+        auto cb = buffer_slice(buffers);
         while(total_written < total_size)
         {
-            auto inner = stream_.write_some(cb);
+            auto inner = stream_.write_some(cb.data());
             if(!inner.await_ready())
                 break;
             auto r = inner.await_resume();
             if(r.ec)
                 co_return io_result<std::size_t>{
                     r.ec, total_written};
-            cb.consume(r.t1);
-            total_written += r.t1;
+            cb.remove_prefix(std::get<0>(r.values));
+            total_written += std::get<0>(r.values);
         }
 
         if(total_written >= total_size)
@@ -384,12 +384,12 @@ class write_now
         while(total_written < total_size)
         {
             auto r =
-                co_await stream_.write_some(cb);
+                co_await stream_.write_some(cb.data());
+            cb.remove_prefix(std::get<0>(r.values));
+            total_written += std::get<0>(r.values);
             if(r.ec)
                 co_return io_result<std::size_t>{
                     r.ec, total_written};
-            cb.consume(r.t1);
-            total_written += r.t1;
         }
         co_return io_result<std::size_t>{
             {}, total_written};
diff --git a/include/boost/capy/io_result.hpp b/include/boost/capy/io_result.hpp
index 6c476d800..ac2f13434 100644
--- a/include/boost/capy/io_result.hpp
+++ b/include/boost/capy/io_result.hpp
@@ -14,6 +14,7 @@
 #include <system_error>
 
 #include <cstddef>
+#include <tuple>
 #include <type_traits>
 #include <utility>
 
@@ -24,376 +25,108 @@ namespace capy {
 
     This template provides a unified result type for async operations,
     always containing a `std::error_code` plus optional additional
-    values. It supports structured bindings.
-
-    @tparam Args Additional value types beyond the error code.
-
-    @par Usage
-    @code
-    auto [ec, n] = co_await s.read_some(buf);
-    if (ec) { ... }
-    @endcode
-*/
-template<class... Args>
-struct io_result
-{
-    static_assert("io_result only supports up to 3 template arguments");
-};
-
-/** Result type for void operations.
-
-    Used by operations like `connect()` that don't return a value
-    beyond success/failure. This specialization is not an aggregate
-    to enable implicit conversion from `error_code`.
-
-    @par Example
-    @code
-    auto [ec] = co_await s.connect(ep);
-    if (ec) { ... }
-    @endcode
-*/
-template<>
-struct [[nodiscard]] io_result<>
-{
-    /** The error code from the operation. */
-    std::error_code ec;
-
-#ifdef _MSC_VER
-    // Tuple protocol (unconditional - io_result<> is not an aggregate)
-    template<std::size_t I>
-    auto& get() & noexcept
-    {
-        static_assert(I == 0, "index out of range");
-        return ec;
-    }
-
-    template<std::size_t I>
-    auto const& get() const& noexcept
-    {
-        static_assert(I == 0, "index out of range");
-        return ec;
-    }
-
-    template<std::size_t I>
-    auto&& get() && noexcept
-    {
-        static_assert(I == 0, "index out of range");
-        return std::move(ec);
-    }
-#endif
-};
-
-/** Result type for byte transfer operations.
-
-    Used by operations like `read_some()` and `write_some()` that
-    return the number of bytes transferred.
+    values. It supports structured bindings via the tuple protocol.
 
     @par Example
     @code
     auto [ec, n] = co_await s.read_some(buf);
     if (ec) { ... }
     @endcode
-*/
-template<typename T1>
-struct [[nodiscard]] io_result<T1>
-{
-    std::error_code ec;
-    T1 t1{};
-
-#ifdef _MSC_VER
-    template<std::size_t I>
-    auto& get() & noexcept
-    {
-        static_assert(I < 2, "index out of range");
-        if constexpr (I == 0) return ec;
-        else return t1;
-    }
-
-    template<std::size_t I>
-    auto const& get() const& noexcept
-    {
-        static_assert(I < 2, "index out of range");
-        if constexpr (I == 0) return ec;
-        else return t1;
-    }
 
-    template<std::size_t I>
-    auto&& get() && noexcept
-    {
-        static_assert(I < 2, "index out of range");
-        if constexpr (I == 0) return std::move(ec);
-        else return std::move(t1);
-    }
-#endif
-};
+    @note Payload members are only meaningful when
+        `ec` does not indicate an error.
 
-template<typename T1, typename T2>
-struct [[nodiscard]] io_result<T1, T2>
+    @tparam Ts Ordered payload types following the leading
+        `std::error_code`.
+*/
+template<class... Ts>
+struct [[nodiscard]] io_result
 {
+    /// The error code from the operation.
     std::error_code ec;
-    T1 t1{};
-    T2 t2{};
 
-#ifdef _MSC_VER
-    template<std::size_t I>
-    auto& get() & noexcept
-    {
-        static_assert(I < 3, "index out of range");
-        if constexpr (I == 0) return ec;
-        else if constexpr (I == 1) return t1;
-        else return t2;
-    }
+    /// The payload values. Unspecified when `ec` is set.
+    std::tuple<Ts...> values;
 
-    template<std::size_t I>
-    auto const& get() const& noexcept
-    {
-        static_assert(I < 3, "index out of range");
-        if constexpr (I == 0) return ec;
-        else if constexpr (I == 1) return t1;
-        else return t2;
-    }
+    /// Construct a default io_result.
+    io_result() = default;
 
-    template<std::size_t I>
-    auto&& get() && noexcept
+    /// Construct from an error code and payload values.
+    io_result(std::error_code ec_, Ts... ts)
+        : ec(ec_)
+        , values(std::move(ts)...)
     {
-        static_assert(I < 3, "index out of range");
-        if constexpr (I == 0) return std::move(ec);
-        else if constexpr (I == 1) return std::move(t1);
-        else return std::move(t2);
     }
-#endif
-};
-
-template<typename T1, typename T2, typename T3>
-struct [[nodiscard]] io_result<T1, T2, T3>
-{
-    std::error_code ec;
-    T1 t1{};
-    T2 t2{};
-    T3 t3{};
 
-#ifdef _MSC_VER
+    /// @cond
     template<std::size_t I>
-    auto& get() & noexcept
+    decltype(auto) get() & noexcept
     {
-        static_assert(I < 4, "index out of range");
-        if constexpr (I == 0) return ec;
-        else if constexpr (I == 1) return t1;
-        else if constexpr (I == 2) return t2;
-        else return t3;
+        static_assert(I < 1 + sizeof...(Ts), "index out of range");
+        if constexpr (I == 0) return (ec);
+        else return std::get<I - 1>(values);
     }
 
     template<std::size_t I>
-    auto const& get() const& noexcept
+    decltype(auto) get() const& noexcept
     {
-        static_assert(I < 4, "index out of range");
-        if constexpr (I == 0) return ec;
-        else if constexpr (I == 1) return t1;
-        else if constexpr (I == 2) return t2;
-        else return t3;
+        static_assert(I < 1 + sizeof...(Ts), "index out of range");
+        if constexpr (I == 0) return (ec);
+        else return std::get<I - 1>(values);
     }
 
     template<std::size_t I>
-    auto&& get() && noexcept
+    decltype(auto) get() && noexcept
     {
-        static_assert(I < 4, "index out of range");
+        static_assert(I < 1 + sizeof...(Ts), "index out of range");
         if constexpr (I == 0) return std::move(ec);
-        else if constexpr (I == 1) return std::move(t1);
-        else if constexpr (I == 2) return std::move(t2);
-        else return std::move(t3);
+        else return std::get<I - 1>(std::move(values));
     }
-#endif
+    /// @endcond
 };
 
-//------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-
-// Free-standing get() overloads for ADL (MSVC workaround for aggregates)
-
-template<std::size_t I>
-auto& get(io_result<>& r) noexcept
+/// @cond
+template<std::size_t I, class... Ts>
+decltype(auto) get(io_result<Ts...>& r) noexcept
 {
     return r.template get<I>();
 }
 
-template<std::size_t I>
-auto const& get(io_result<> const& r) noexcept
+template<std::size_t I, class... Ts>
+decltype(auto) get(io_result<Ts...> const& r) noexcept
 {
     return r.template get<I>();
 }
 
-template<std::size_t I>
-auto&& get(io_result<>&& r) noexcept
+template<std::size_t I, class... Ts>
+decltype(auto) get(io_result<Ts...>&& r) noexcept
 {
     return std::move(r).template get<I>();
 }
-
-template<std::size_t I, typename T1>
-auto& get(io_result<T1>& r) noexcept
-{
-    return r.template get<I>();
-}
-
-template<std::size_t I, typename T1>
-auto const& get(io_result<T1> const& r) noexcept
-{
-    return r.template get<I>();
-}
-
-template<std::size_t I, typename T1>
-auto&& get(io_result<T1>&& r) noexcept
-{
-    return std::move(r).template get<I>();
-}
-
-template<std::size_t I, typename T1, typename T2>
-auto& get(io_result<T1, T2>& r) noexcept
-{
-    return r.template get<I>();
-}
-
-template<std::size_t I, typename T1, typename T2>
-auto const& get(io_result<T1, T2> const& r) noexcept
-{
-    return r.template get<I>();
-}
-
-template<std::size_t I, typename T1, typename T2>
-auto&& get(io_result<T1, T2>&& r) noexcept
-{
-    return std::move(r).template get<I>();
-}
-
-template<std::size_t I, typename T1, typename T2, typename T3>
-auto& get(io_result<T1, T2, T3>& r) noexcept
-{
-    return r.template get<I>();
-}
-
-template<std::size_t I, typename T1, typename T2, typename T3>
-auto const& get(io_result<T1, T2, T3> const& r) noexcept
-{
-    return r.template get<I>();
-}
-
-template<std::size_t I, typename T1, typename T2, typename T3>
-auto&& get(io_result<T1, T2, T3>&& r) noexcept
-{
-    return std::move(r).template get<I>();
-}
-
-#endif // _MSC_VER
+/// @endcond
 
 } // namespace capy
 } // namespace boost
 
-//------------------------------------------------------------------------------
-
-#ifdef _MSC_VER
-
-// Tuple protocol for structured bindings (MSVC workaround)
-// MSVC has a bug with aggregate decomposition in coroutines, so we use
-// tuple protocol instead which forces the compiler to use get<>() functions.
-
+// Tuple protocol for structured bindings
 namespace std {
 
-template<>
-struct tuple_size<boost::capy::io_result<>>
-    : std::integral_constant<std::size_t, 1> {};
+template<class... Ts>
+struct tuple_size<boost::capy::io_result<Ts...>>
+    : std::integral_constant<std::size_t, 1 + sizeof...(Ts)> {};
 
-template<>
-struct tuple_element<0, boost::capy::io_result<>>
+template<class... Ts>
+struct tuple_element<0, boost::capy::io_result<Ts...>>
 {
-    using type = ::std::error_code;
+    using type = std::error_code;
 };
 
-template<typename T1>
-struct tuple_size<boost::capy::io_result<T1>>
-    : std::integral_constant<std::size_t, 2> {};
-
-template<typename T1, typename T2>
-struct tuple_size<boost::capy::io_result<T1, T2>>
-    : std::integral_constant<std::size_t, 3> {};
-
-template<typename T1, typename T2, typename T3>
-struct tuple_size<boost::capy::io_result<T1, T2, T3>>
-    : std::integral_constant<std::size_t, 4> {};
-
-// tuple_element specializations for io_result<T1>
-
-template<>
-struct tuple_element<0, boost::capy::io_result<std::size_t>>
-{
-    using type = ::std::error_code;
-};
-
-template<>
-struct tuple_element<1, boost::capy::io_result<std::size_t>>
-{
-    using type = std::size_t;
-};
-
-template<typename T1>
-struct tuple_element<0, boost::capy::io_result<T1>>
-{
-    using type = ::std::error_code;
-};
-
-template<typename T1>
-struct tuple_element<1, boost::capy::io_result<T1>>
-{
-    using type = T1;
-};
-
-// tuple_element specializations for io_result<T1, T2>
-
-template<typename T1, typename T2>
-struct tuple_element<0, boost::capy::io_result<T1, T2>>
-{
-    using type = ::std::error_code;
-};
-
-template<typename T1, typename T2>
-struct tuple_element<1, boost::capy::io_result<T1, T2>>
+template<std::size_t I, class... Ts>
+struct tuple_element<I, boost::capy::io_result<Ts...>>
 {
-    using type = T1;
-};
-
-template<typename T1, typename T2>
-struct tuple_element<2, boost::capy::io_result<T1, T2>>
-{
-    using type = T2;
-};
-
-// tuple_element specializations for io_result<T1, T2, T3>
-
-template<typename T1, typename T2, typename T3>
-struct tuple_element<0, boost::capy::io_result<T1, T2, T3>>
-{
-    using type = ::std::error_code;
-};
-
-template<typename T1, typename T2, typename T3>
-struct tuple_element<1, boost::capy::io_result<T1, T2, T3>>
-{
-    using type = T1;
-};
-
-template<typename T1, typename T2, typename T3>
-struct tuple_element<2, boost::capy::io_result<T1, T2, T3>>
-{
-    using type = T2;
-};
-
-template<typename T1, typename T2, typename T3>
-struct tuple_element<3, boost::capy::io_result<T1, T2, T3>>
-{
-    using type = T3;
+    using type = std::tuple_element_t<I - 1, std::tuple<Ts...>>;
 };
 
 } // namespace std
 
-#endif // _MSC_VER
-
 #endif // BOOST_CAPY_IO_RESULT_HPP
diff --git a/include/boost/capy/quitter.hpp b/include/boost/capy/quitter.hpp
new file mode 100644
index 000000000..c4be6fefd
--- /dev/null
+++ b/include/boost/capy/quitter.hpp
@@ -0,0 +1,375 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_QUITTER_HPP
+#define BOOST_CAPY_QUITTER_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/detail/stop_requested_exception.hpp>
+#include <boost/capy/concept/executor.hpp>
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/ex/io_awaitable_promise_base.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/ex/frame_allocator.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
+
+#include <exception>
+#include <optional>
+#include <type_traits>
+#include <utility>
+
+/* Stop-aware coroutine task.
+
+   quitter<T> is identical to task<T> except that when the stop token
+   is triggered, the coroutine body never sees the cancellation.  The
+   promise intercepts it on resume (in transform_awaiter::await_resume)
+   and throws a sentinel exception that unwinds through RAII destructors
+   to final_suspend.  The parent sees a "stopped" completion.
+
+   See doc/quitter.md for the full design rationale. */
+
+namespace boost {
+namespace capy {
+
+namespace detail {
+
+// Reuse the same return-value storage as task<T>.
+// task_return_base is defined in task.hpp, but quitter needs its own
+// copy to avoid a header dependency on task.hpp.
+template<typename T>
+struct quitter_return_base
+{
+    std::optional<T> result_;
+
+    void return_value(T value)
+    {
+        result_ = std::move(value);
+    }
+
+    T&& result() noexcept
+    {
+        return std::move(*result_);
+    }
+};
+
+template<>
+struct quitter_return_base<void>
+{
+    void return_void()
+    {
+    }
+};
+
+} // namespace detail
+
+/** Stop-aware lazy coroutine task satisfying @ref IoRunnable.
+
+    When the stop token is triggered, the next `co_await` inside the
+    coroutine short-circuits: the body never sees the result and RAII
+    destructors run normally.  The parent observes a "stopped"
+    completion via @ref promise_type::stopped.
+
+    Everything else — frame allocation, environment propagation,
+    symmetric transfer, move semantics — is identical to @ref task.
+
+    @tparam T The result type.  Use `quitter<>` for `quitter<void>`.
+
+    @see task, IoRunnable, IoAwaitable
+*/
+template<typename T = void>
+struct [[nodiscard]] BOOST_CAPY_CORO_AWAIT_ELIDABLE
+    quitter
+{
+    struct promise_type
+        : io_awaitable_promise_base<promise_type>
+        , detail::quitter_return_base<T>
+    {
+    private:
+        friend quitter;
+
+        enum class completion { running, value, exception, stopped };
+
+        union { std::exception_ptr ep_; };
+        completion state_;
+
+    public:
+        promise_type() noexcept
+            : state_(completion::running)
+        {
+        }
+
+        ~promise_type()
+        {
+            if(state_ == completion::exception ||
+               state_ == completion::stopped)
+                ep_.~exception_ptr();
+        }
+
+        /// Return a non-null exception_ptr when the coroutine threw
+        /// or was stopped.  Stopped quitters report the sentinel
+        /// stop_requested_exception so that run_async routes to
+        /// the error handler instead of accessing a non-existent
+        /// result.
+        std::exception_ptr exception() const noexcept
+        {
+            if(state_ == completion::exception ||
+               state_ == completion::stopped)
+                return ep_;
+            return {};
+        }
+
+        /// True when the coroutine was stopped via the stop token.
+        bool stopped() const noexcept
+        {
+            return state_ == completion::stopped;
+        }
+
+        quitter get_return_object()
+        {
+            return quitter{
+                std::coroutine_handle<promise_type>::from_promise(*this)};
+        }
+
+        auto initial_suspend() noexcept
+        {
+            struct awaiter
+            {
+                promise_type* p_;
+
+                bool await_ready() const noexcept
+                {
+                    return false;
+                }
+
+                void await_suspend(std::coroutine_handle<>) const noexcept
+                {
+                }
+
+                // Potentially-throwing: checks the stop token before
+                // the coroutine body executes its first statement.
+                void await_resume() const
+                {
+                    set_current_frame_allocator(
+                        p_->environment()->frame_allocator);
+                    if(p_->environment()->stop_token.stop_requested())
+                        throw detail::stop_requested_exception{};
+                }
+            };
+            return awaiter{this};
+        }
+
+        auto final_suspend() noexcept
+        {
+            struct awaiter
+            {
+                promise_type* p_;
+
+                bool await_ready() const noexcept
+                {
+                    return false;
+                }
+
+                std::coroutine_handle<> await_suspend(
+                    std::coroutine_handle<>) const noexcept
+                {
+                    return p_->continuation();
+                }
+
+                void await_resume() const noexcept
+                {
+                }
+            };
+            return awaiter{this};
+        }
+
+        void unhandled_exception()
+        {
+            try
+            {
+                throw;
+            }
+            catch(detail::stop_requested_exception const&)
+            {
+                // Store the exception_ptr so that run_async's
+                // invoke_impl routes to the error handler
+                // instead of accessing a non-existent result.
+                new (&ep_) std::exception_ptr(
+                    std::current_exception());
+                state_ = completion::stopped;
+            }
+            catch(...)
+            {
+                new (&ep_) std::exception_ptr(
+                    std::current_exception());
+                state_ = completion::exception;
+            }
+        }
+
+        //------------------------------------------------------
+        // transform_awaitable — the key difference from task<T>
+        //------------------------------------------------------
+
+        template<class Awaitable>
+        struct transform_awaiter
+        {
+            std::decay_t<Awaitable> a_;
+            promise_type* p_;
+
+            bool await_ready() noexcept
+            {
+                return a_.await_ready();
+            }
+
+            // Check the stop token BEFORE the coroutine body
+            // sees the result of the I/O operation.
+            decltype(auto) await_resume()
+            {
+                set_current_frame_allocator(
+                    p_->environment()->frame_allocator);
+                if(p_->environment()->stop_token.stop_requested())
+                    throw detail::stop_requested_exception{};
+                return a_.await_resume();
+            }
+
+            template<class Promise>
+            auto await_suspend(
+                std::coroutine_handle<Promise> h) noexcept
+            {
+                using R = decltype(
+                    a_.await_suspend(h, p_->environment()));
+                if constexpr (std::is_same_v<
+                    R, std::coroutine_handle<>>)
+                    return detail::symmetric_transfer(
+                        a_.await_suspend(h, p_->environment()));
+                else
+                    return a_.await_suspend(
+                        h, p_->environment());
+            }
+        };
+
+        template<class Awaitable>
+        auto transform_awaitable(Awaitable&& a)
+        {
+            using A = std::decay_t<Awaitable>;
+            if constexpr (IoAwaitable<A>)
+            {
+                return transform_awaiter<Awaitable>{
+                    std::forward<Awaitable>(a), this};
+            }
+            else
+            {
+                static_assert(sizeof(A) == 0,
+                    "requires IoAwaitable");
+            }
+        }
+    };
+
+    std::coroutine_handle<promise_type> h_;
+
+    /// Destroy the quitter and its coroutine frame if owned.
+    ~quitter()
+    {
+        if(h_)
+            h_.destroy();
+    }
+
+    /// Return false; quitters are never immediately ready.
+    bool await_ready() const noexcept
+    {
+        return false;
+    }
+
+    /** Return the result, rethrow exception, or propagate stop.
+
+        When stopped, throws stop_requested_exception so that a
+        parent quitter also stops.  A parent task<T> will see this
+        as an unhandled exception — by design.
+    */
+    auto await_resume()
+    {
+        if(h_.promise().stopped())
+            throw detail::stop_requested_exception{};
+        if(h_.promise().state_ == promise_type::completion::exception)
+            std::rethrow_exception(h_.promise().ep_);
+        if constexpr (! std::is_void_v<T>)
+            return std::move(*h_.promise().result_);
+        else
+            return;
+    }
+
+    /// Start execution with the caller's context.
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> cont,
+        io_env const* env)
+    {
+        h_.promise().set_continuation(cont);
+        h_.promise().set_environment(env);
+        return h_;
+    }
+
+    /** Return the coroutine handle.
+
+        @note Do not call `destroy()` on the returned handle while
+        the quitter is being awaited. The quitter's lifetime is
+        normally managed by `run_async`, `run`, or the awaiting
+        parent; manually destroying a suspended quitter that another
+        coroutine is awaiting produces undefined behavior. For
+        cooperative cancellation, use `std::stop_token`.
+
+        @return The coroutine handle.
+    */
+    std::coroutine_handle<promise_type> handle() const noexcept
+    {
+        return h_;
+    }
+
+    /** Release ownership of the coroutine frame.
+
+        @note If the caller intends to call `destroy()` on the
+        released handle, it must do so only when the quitter has not
+        started or has fully completed. Destroying a suspended
+        quitter that is being awaited produces undefined behavior.
+    */
+    void release() noexcept
+    {
+        h_ = nullptr;
+    }
+
+    quitter(quitter const&) = delete;
+    quitter& operator=(quitter const&) = delete;
+
+    /// Construct by moving, transferring ownership.
+    quitter(quitter&& other) noexcept
+        : h_(std::exchange(other.h_, nullptr))
+    {
+    }
+
+    /// Assign by moving, transferring ownership.
+    quitter& operator=(quitter&& other) noexcept
+    {
+        if(this != &other)
+        {
+            if(h_)
+                h_.destroy();
+            h_ = std::exchange(other.h_, nullptr);
+        }
+        return *this;
+    }
+
+private:
+    explicit quitter(std::coroutine_handle<promise_type> h)
+        : h_(h)
+    {
+    }
+};
+
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/include/boost/capy/read.hpp b/include/boost/capy/read.hpp
index c9ba125cf..4a85aa9cd 100644
--- a/include/boost/capy/read.hpp
+++ b/include/boost/capy/read.hpp
@@ -14,7 +14,7 @@
 #include <boost/capy/cond.hpp>
 #include <boost/capy/io_task.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/consuming_buffers.hpp>
+#include <boost/capy/buffers/buffer_slice.hpp>
 #include <boost/capy/concept/dynamic_buffer.hpp>
 #include <boost/capy/concept/read_source.hpp>
 #include <boost/capy/concept/read_stream.hpp>
@@ -25,115 +25,146 @@
 namespace boost {
 namespace capy {
 
-/** Asynchronously read until the buffer sequence is full.
+/** Read data from a stream until the buffer sequence is full.
 
-    Reads data from the stream by calling `read_some` repeatedly
-    until the entire buffer sequence is filled or an error occurs.
+    @par Await-effects
 
-    @li The operation completes when:
-    @li The buffer sequence is completely filled
-    @li An error occurs (including `cond::eof`)
-    @li The operation is cancelled
+    Reads data from `stream` via awaiting `stream.read_some` repeatedly
+    until:
 
-    @par Cancellation
-    Supports cancellation via `stop_token` propagated through the
-    IoAwaitable protocol. When cancelled, returns with `cond::canceled`.
+    @li either the entire buffer sequence  @c buffers is filled,
+    @li or a contingency occurs.
 
-    @param stream The stream to read from. The caller retains ownership.
-    @param buffers The buffer sequence to fill. The caller retains
-        ownership and must ensure validity until the operation completes.
+    If `buffer_size(buffers) == 0` then no awaiting `stream.read_some`
+    is performed. This is not a contingency.
+
+    @par Await-returns
+    An object of type `io_result<std::size_t>` destructuring as `[ec, n]`.
+
+    Upon a contingency, `n` represents the number of bytes read so far,
+    inclusive of the last partial read.
+
+    Contingencies:
+
+    @li The first contingency reported from awaiting @c stream.read_some .
+
+    Notable conditions:
+
+    @li @c cond::canceled — Operation was cancelled,
+    @li @c cond::eof — Stream reached end before `buffers` was filled.
+
+    @par Await-postcondition
+    `ec || n == buffer_size(buffers)`.
+
+    @param stream The stream to read from. If the lifetime of `stream` ends
+    before the coroutine finishes, the behavior is undefined.
+
+    @param buffers The buffer sequence to fill. If the lifetime of the buffer
+    sequence represented by `buffers` ends before the coroutine finishes, the behavior is undefined.
+
+
+    @par Remarks
+    Supports _IoAwaitable cancellation_.
 
-    @return An awaitable yielding `(error_code, std::size_t)`.
-        On success, `n` equals `buffer_size(buffers)`. On error,
-        `n` is the number of bytes read before the error. Compare
-        error codes to conditions:
-        @li `cond::eof` - Stream reached end before buffer was filled
-        @li `cond::canceled` - Operation was cancelled
 
     @par Example
 
     @code
-    task<> read_message( ReadStream auto& stream )
+    capy::task<> process_message(capy::ReadStream auto& stream)
     {
-        char header[16];
-        auto [ec, n] = co_await read( stream, mutable_buffer( header ) );
-        if( ec == cond::eof )
+        std::vector<char> header(16);  // known header size for some protocol
+        auto [ec, n] = co_await capy::read(stream, capy::mutable_buffer(header));
+        if (ec == capy::cond::eof)
             co_return;  // Connection closed
-        if( ec )
-            detail::throw_system_error( ec );
-        // header contains exactly 16 bytes
+        if (ec)
+            throw std::system_error(ec);
+
+        // at this point `header` contains exactly 16 bytes
     }
     @endcode
 
-    @see read_some, ReadStream, MutableBufferSequence
+    @see ReadStream, MutableBufferSequence
 */
+template <typename S, typename MB>
+  requires ReadStream<S> && MutableBufferSequence<MB>
 auto
-read(
-    ReadStream auto& stream,
-    MutableBufferSequence auto const& buffers) ->
+read(S& stream, MB buffers) ->
         io_task<std::size_t>
 {
-    consuming_buffers consuming(buffers);
+    auto consuming = buffer_slice(buffers);
     std::size_t const total_size = buffer_size(buffers);
     std::size_t total_read = 0;
 
     while(total_read < total_size)
     {
-        auto [ec, n] = co_await stream.read_some(consuming);
+        auto [ec, n] = co_await stream.read_some(consuming.data());
+        consuming.remove_prefix(n);
+        total_read += n;
         if(ec)
             co_return {ec, total_read};
-        consuming.consume(n);
-        total_read += n;
     }
 
     co_return {{}, total_read};
 }
 
-/** Asynchronously read all data from a stream into a dynamic buffer.
+/** Read all data from a stream into a dynamic buffer.
+
+    @par Await-effects
 
-    Reads data by calling `read_some` repeatedly until EOF is reached
-    or an error occurs. Data is appended using prepare/commit semantics.
+    Reads data from `stream` via awaiting `stream.read_some` repeatedly
+    and appending the results to `dynbuf`,
+    until a contingency occurs.
+
+    Data is appended using prepare/commit semantics.
     The buffer grows with 1.5x factor when filled.
 
-    @li The operation completes when:
-    @li End-of-stream is reached (`cond::eof`)
-    @li An error occurs
-    @li The operation is cancelled
+    @par Await-returns
+
+    An object of type `io_result<std::size_t>` destructuring as `[ec, n]`.
+
+    `n` represents the total number of bytes read,
+    inclusive of the last partial read.
+
+    Contingencies:
 
-    @par Cancellation
-    Supports cancellation via `stop_token` propagated through the
-    IoAwaitable protocol. When cancelled, returns with `cond::canceled`.
+    @li The first contingency, other than one matching to @c cond::eof, reported from awaiting @c stream.read_some .
+
+    @par Await-throws
+    `std::bad_alloc` when append to `dynbuf` fails.
+
+    @param stream The stream to read from. If the lifetime of `stream` ends
+    before the coroutine finishes, the behavior is undefined.
+
+    @param dynbuf The dynamic buffer to append data to. If the lifetime of the buffer
+    sequence represented by `dynbuf` ends before the coroutine finishes, the behavior is undefined.
 
-    @param stream The stream to read from. The caller retains ownership.
-    @param buffers The dynamic buffer to append data to. Must remain
-        valid until the operation completes.
     @param initial_amount Initial bytes to prepare (default 2048).
 
-    @return An awaitable yielding `(error_code, std::size_t)`.
-        On success (EOF), `ec` is clear and `n` is total bytes read.
-        On error, `n` is bytes read before the error. Compare error
-        codes to conditions:
-        @li `cond::canceled` - Operation was cancelled
+    
+    @par Remarks
+    Supports _IoAwaitable cancellation_.
 
     @par Example
 
     @code
-    task<std::string> read_body( ReadStream auto& stream )
+    capy::task<std::string> read_body(capy::ReadStream auto& stream)
     {
         std::string body;
-        auto [ec, n] = co_await read( stream, string_dynamic_buffer( &body ) );
-        if( ec )
-            detail::throw_system_error( ec );
+        auto [ec, n] = co_await capy::read(stream, capy::dynamic_buffer(body));
+        if (ec)
+            throw std::system_error(ec);
         return body;
     }
     @endcode
 
     @see read_some, ReadStream, DynamicBufferParam
 */
+template <typename S, typename DB>
+  requires ReadStream<S> && DynamicBufferParam<DB>
 auto
 read(
-    ReadStream auto& stream,
-    DynamicBufferParam auto&& buffers,
+    S& stream,
+    DB&& dynbuf,
     std::size_t initial_amount = 2048) ->
         io_task<std::size_t>
 {
@@ -141,10 +172,10 @@ read(
     std::size_t total_read = 0;
     for(;;)
     {
-        auto mb = buffers.prepare(amount);
+        auto mb = dynbuf.prepare(amount);
         auto const mb_size = buffer_size(mb);
         auto [ec, n] = co_await stream.read_some(mb);
-        buffers.commit(n);
+        dynbuf.commit(n);
         total_read += n;
         if(ec == cond::eof)
             co_return {{}, total_read};
@@ -155,51 +186,66 @@ read(
     }
 }
 
-/** Asynchronously read all data from a source into a dynamic buffer.
+/** Read all data from a source into a dynamic buffer.
+
+    @par Await-effects
 
-    Reads data by calling `source.read` repeatedly until EOF is reached
-    or an error occurs. Data is appended using prepare/commit semantics.
+    Reads data from `stream` by calling `source.read` repeatedly 
+    and appending it to `dynbuf` until a contingency occurs.
+    The last, potenitally partial, read is also appended.
+    
+    Data is appended using prepare/commit semantics.
     The buffer grows with 1.5x factor when filled.
 
-    @li The operation completes when:
-    @li End-of-stream is reached (`cond::eof`)
-    @li An error occurs
-    @li The operation is cancelled
+    @par Await-returns
+
+    An object of type `io_result<std::size_t>` destructuring as `[ec, n]`.
+
+    `n` represents the total number of bytes read,
+    inclusive of the last partial read.
+
+
+    Contingencies:
+
+    @li The first contingency, other than one matching to @c cond::eof, reported from awaiting @c stream.read_some .
+
+    @par Await-throws
+
+    `std::bad_alloc` when append to `dynbuf` fails.
+
+    @param source The source to read from. If the lifetime of `source` ends
+    before the coroutine finishes, the behavior is undefined.
 
-    @par Cancellation
-    Supports cancellation via `stop_token` propagated through the
-    IoAwaitable protocol. When cancelled, returns with `cond::canceled`.
+    @param dynbuf The dynamic buffer to append data to. If the lifetime of the 
+    buffer sequence represented by `dynbuf` ends before the coroutine finishes, 
+    the behavior is undefined.
 
-    @param source The source to read from. The caller retains ownership.
-    @param buffers The dynamic buffer to append data to. Must remain
-        valid until the operation completes.
     @param initial_amount Initial bytes to prepare (default 2048).
 
-    @return An awaitable yielding `(error_code, std::size_t)`.
-        On success (EOF), `ec` is clear and `n` is total bytes read.
-        On error, `n` is bytes read before the error. Compare error
-        codes to conditions:
-        @li `cond::canceled` - Operation was cancelled
+    @par Remarks
+    Supports _IoAwaitable cancellation_.
 
     @par Example
 
     @code
-    task<std::string> read_body( ReadSource auto& source )
+    capy::task<std::string> read_body(capy::ReadSource auto& source)
     {
         std::string body;
-        auto [ec, n] = co_await read( source, string_dynamic_buffer( &body ) );
-        if( ec )
-            detail::throw_system_error( ec );
+        auto [ec, n] = co_await capy::read(source, capy::dynamic_buffer(body));
+        if (ec)
+            throw std::system_error(ec);
         return body;
     }
     @endcode
 
     @see ReadSource, DynamicBufferParam
 */
+template <typename S, typename DB>
+  requires ReadSource<S> && DynamicBufferParam<DB>
 auto
 read(
-    ReadSource auto& source,
-    DynamicBufferParam auto&& buffers,
+    S& source,
+    DB&& dynbuf,
     std::size_t initial_amount = 2048) ->
         io_task<std::size_t>
 {
@@ -207,10 +253,10 @@ read(
     std::size_t total_read = 0;
     for(;;)
     {
-        auto mb = buffers.prepare(amount);
+        auto mb = dynbuf.prepare(amount);
         auto const mb_size = buffer_size(mb);
         auto [ec, n] = co_await source.read(mb);
-        buffers.commit(n);
+        dynbuf.commit(n);
         total_read += n;
         if(ec == cond::eof)
             co_return {{}, total_read};
diff --git a/include/boost/capy/read_until.hpp b/include/boost/capy/read_until.hpp
index 978830663..f2a3b95e6 100644
--- a/include/boost/capy/read_until.hpp
+++ b/include/boost/capy/read_until.hpp
@@ -205,8 +205,22 @@ struct read_until_awaitable
 */
 struct match_delim
 {
+    /** The delimiter string to search for.
+
+        @note The referenced characters must remain valid
+            for the lifetime of this object and any pending
+            read operation.
+    */
     std::string_view delim;
 
+    /** Search for the delimiter in `data`.
+
+        @param data The data to search.
+        @param hint If non-null, receives the overlap hint
+            on miss.
+        @return `0` if `delim` is empty; otherwise the position
+            just past the delimiter, or `npos` if not found.
+    */
     std::size_t
     operator()(
         std::string_view data,
@@ -248,7 +262,7 @@ struct match_delim
     @param initial_amount Initial bytes to read per iteration (default
         2048). Grows by 1.5x when filled.
 
-    @return An awaitable yielding `(error_code, std::size_t)`.
+    @return An awaitable that await-returns `(error_code, std::size_t)`.
         On success, `n` is the position returned by the match condition
         (bytes up to and including the matched delimiter). Compare error
         codes to conditions:
@@ -326,7 +340,7 @@ read_until(
     @param initial_amount Initial bytes to read per iteration (default
         2048). Grows by 1.5x when filled.
 
-    @return An awaitable yielding `(error_code, std::size_t)`.
+    @return An awaitable that await-returns `(error_code, std::size_t)`.
         On success, `n` is bytes up to and including the delimiter.
         Compare error codes to conditions:
         @li `cond::eof` - EOF before delimiter; `n` is buffer size
diff --git a/include/boost/capy/task.hpp b/include/boost/capy/task.hpp
index b50cfc46e..ce2a5b556 100644
--- a/include/boost/capy/task.hpp
+++ b/include/boost/capy/task.hpp
@@ -4,7 +4,7 @@
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 //
-// Official repository: https://github.com/cppalliance/corosio
+// Official repository: https://github.com/cppalliance/capy
 //
 
 #ifndef BOOST_CAPY_TASK_HPP
@@ -16,6 +16,7 @@
 #include <boost/capy/ex/io_awaitable_promise_base.hpp>
 #include <boost/capy/ex/io_env.hpp>
 #include <boost/capy/ex/frame_allocator.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
 
 #include <exception>
 #include <optional>
@@ -67,8 +68,6 @@ struct task_return_base<void>
     to nested `co_await` expressions. This enables cancellation and
     proper completion dispatch across executor boundaries.
 
-    @tparam T The result type. Use `task<>` for `task<void>`.
-
     @par Thread Safety
     Distinct objects: Safe.
     Shared objects: Unsafe.
@@ -91,6 +90,8 @@ struct task_return_base<void>
     }
     @endcode
 
+    @tparam T The result type. Use `task<>` for `task<void>`.
+
     @see IoRunnable, IoAwaitable, run, run_async
 */
 template<typename T = void>
@@ -177,7 +178,7 @@ struct [[nodiscard]] BOOST_CAPY_CORO_AWAIT_ELIDABLE
             return awaiter{this};
         }
 
-        void unhandled_exception()
+        void unhandled_exception() noexcept
         {
             new (&ep_) std::exception_ptr(std::current_exception());
             has_ep_ = true;
@@ -204,7 +205,11 @@ struct [[nodiscard]] BOOST_CAPY_CORO_AWAIT_ELIDABLE
             template<class Promise>
             auto await_suspend(std::coroutine_handle<Promise> h) noexcept
             {
-                return a_.await_suspend(h, p_->environment());
+                using R = decltype(a_.await_suspend(h, p_->environment()));
+                if constexpr (std::is_same_v<R, std::coroutine_handle<>>)
+                    return detail::symmetric_transfer(a_.await_suspend(h, p_->environment()));
+                else
+                    return a_.await_suspend(h, p_->environment());
             }
         };
 
@@ -258,7 +263,17 @@ struct [[nodiscard]] BOOST_CAPY_CORO_AWAIT_ELIDABLE
         return h_;
     }
 
-    /// Return the coroutine handle.
+    /** Return the coroutine handle.
+
+        @note Do not call `destroy()` on the returned handle while the
+        task is being awaited. The task's lifetime is normally managed
+        by `run_async`, `run`, or the awaiting parent; manually
+        destroying a suspended task that another coroutine is awaiting
+        produces undefined behavior. For cooperative cancellation, use
+        `std::stop_token`.
+
+        @return The coroutine handle.
+    */
     std::coroutine_handle<promise_type> handle() const noexcept
     {
         return h_;
@@ -270,6 +285,11 @@ struct [[nodiscard]] BOOST_CAPY_CORO_AWAIT_ELIDABLE
         coroutine frame. The caller becomes responsible for the frame's
         lifetime.
 
+        @note If the caller intends to call `destroy()` on the
+        released handle, it must do so only when the task has not
+        started or has fully completed. Destroying a suspended task
+        that is being awaited produces undefined behavior.
+
         @par Postconditions
         `handle()` returns the original handle, but the task no longer
         owns it.
@@ -282,13 +302,13 @@ struct [[nodiscard]] BOOST_CAPY_CORO_AWAIT_ELIDABLE
     task(task const&) = delete;
     task& operator=(task const&) = delete;
 
-    /// Move construct, transferring ownership.
+    /// Construct by moving, transferring ownership.
     task(task&& other) noexcept
         : h_(std::exchange(other.h_, nullptr))
     {
     }
 
-    /// Move assign, transferring ownership.
+    /// Assign by moving, transferring ownership.
     task& operator=(task&& other) noexcept
     {
         if(this != &other)
diff --git a/include/boost/capy/test/buffer_sink.hpp b/include/boost/capy/test/buffer_sink.hpp
index 8bb3fde74..516c95977 100644
--- a/include/boost/capy/test/buffer_sink.hpp
+++ b/include/boost/capy/test/buffer_sink.hpp
@@ -150,7 +150,7 @@ class buffer_sink
 
         @param n The number of bytes to commit.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
 
         @see fuse
     */
@@ -207,7 +207,7 @@ class buffer_sink
 
         @param n The number of bytes to commit.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
 
         @see fuse
     */
diff --git a/include/boost/capy/test/buffer_source.hpp b/include/boost/capy/test/buffer_source.hpp
index 0fb8a50f7..008b80753 100644
--- a/include/boost/capy/test/buffer_source.hpp
+++ b/include/boost/capy/test/buffer_source.hpp
@@ -139,7 +139,7 @@ class buffer_source
 
         @param dest Span of const_buffer to fill.
 
-        @return An awaitable yielding `(error_code,std::span<const_buffer>)`.
+        @return An awaitable that await-returns `(error_code,std::span<const_buffer>)`.
 
         @see consume, fuse
     */
diff --git a/include/boost/capy/test/bufgrind.hpp b/include/boost/capy/test/bufgrind.hpp
index c5f2bb650..48446ad76 100644
--- a/include/boost/capy/test/bufgrind.hpp
+++ b/include/boost/capy/test/bufgrind.hpp
@@ -12,12 +12,13 @@
 
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/slice.hpp>
+#include <boost/capy/buffers/buffer_slice.hpp>
 #include <coroutine>
 #include <boost/capy/ex/io_env.hpp>
 
 #include <algorithm>
 #include <cstddef>
+#include <type_traits>
 #include <utility>
 
 namespace boost {
@@ -32,9 +33,12 @@ namespace test {
     that allows `co_await` between iterations.
 
     The split type automatically preserves mutability: passing a
-    `MutableBufferSequence` yields mutable slices, while passing a
-    `ConstBufferSequence` yields const slices. This is handled
-    automatically through `slice_type<BS>`.
+    `MutableBufferSequence` yields halves that model
+    @ref MutableBufferSequence, while passing a `ConstBufferSequence`
+    yields halves that model @ref ConstBufferSequence. Each half is
+    the buffer-sequence view exposed by a @ref buffer_slice over the
+    corresponding byte range, and can be passed directly to
+    `read_some`, `write_some`, `buffer_size`, etc.
 
     @par Thread Safety
     Not thread-safe.
@@ -50,8 +54,8 @@ namespace test {
         bufgrind bg( cb );
         while( bg ) {
             auto [b1, b2] = co_await bg.next();
-            // b1 contains first N bytes
-            // b2 contains remaining bytes
+            // b1 contains first N bytes (as a buffer sequence)
+            // b2 contains remaining bytes (as a buffer sequence)
             // concatenating b1 + b2 equals original
             co_await some_async_operation( b1, b2 );
         }
@@ -81,7 +85,7 @@ namespace test {
     }
     @endcode
 
-    @see prefix, sans_prefix, slice_type
+    @see buffer_slice
 */
 template<ConstBufferSequence BS>
 class bufgrind
@@ -92,8 +96,13 @@ class bufgrind
     std::size_t pos_ = 0;
 
 public:
-    /// The type returned by @ref next.
-    using split_type = std::pair<slice_type<BS>, slice_type<BS>>;
+    /// The slice type produced for each half of a split.
+    using slice_type = std::decay_t<
+        decltype(buffer_slice(std::declval<BS const&>()))>;
+
+    /// The type returned by @ref next. Each half is a Slice; use
+    /// `.data()` to obtain the buffer sequence view.
+    using split_type = std::pair<slice_type, slice_type>;
 
     /** Construct a buffer grinder.
 
@@ -135,13 +144,15 @@ class bufgrind
         split_type
         await_resume()
         {
-            auto b1 = prefix(self_->bs_, self_->pos_);
-            auto b2 = sans_prefix(self_->bs_, self_->pos_);
+            split_type result{
+                buffer_slice(self_->bs_, 0, self_->pos_),
+                buffer_slice(self_->bs_, self_->pos_)
+            };
             if(self_->pos_ < self_->size_)
                 self_->pos_ = (std::min)(self_->pos_ + self_->step_, self_->size_);
             else
                 ++self_->pos_;
-            return {std::move(b1), std::move(b2)};
+            return result;
         }
     };
 
@@ -153,7 +164,7 @@ class bufgrind
         @par Preconditions
         `static_cast<bool>( *this )` is `true`.
 
-        @return An awaitable yielding `split_type`.
+        @return An awaitable that await-returns `split_type`.
     */
     next_awaitable
     next() noexcept
diff --git a/include/boost/capy/test/read_source.hpp b/include/boost/capy/test/read_source.hpp
index cfa23233e..4ab265a30 100644
--- a/include/boost/capy/test/read_source.hpp
+++ b/include/boost/capy/test/read_source.hpp
@@ -121,7 +121,7 @@ class read_source
 
         @param buffers The mutable buffer sequence to receive data.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse
     */
@@ -181,7 +181,7 @@ class read_source
 
         @param buffers The mutable buffer sequence to receive data.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse
     */
diff --git a/include/boost/capy/test/read_stream.hpp b/include/boost/capy/test/read_stream.hpp
index 7fbba8b92..d31143507 100644
--- a/include/boost/capy/test/read_stream.hpp
+++ b/include/boost/capy/test/read_stream.hpp
@@ -127,7 +127,7 @@ class read_stream
 
         @param buffers The mutable buffer sequence to receive data.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse
     */
diff --git a/include/boost/capy/test/run_blocking.hpp b/include/boost/capy/test/run_blocking.hpp
index 0e63fd6dd..14667c87d 100644
--- a/include/boost/capy/test/run_blocking.hpp
+++ b/include/boost/capy/test/run_blocking.hpp
@@ -77,12 +77,12 @@ struct BOOST_CAPY_DECL blocking_executor
         Returns the handle for symmetric transfer. The caller
         resumes the coroutine via the returned handle.
 
-        @param h The coroutine handle to execute.
+        @param c The continuation to execute.
 
-        @return `h` for symmetric transfer.
+        @return `c.h` for symmetric transfer.
     */
     std::coroutine_handle<>
-    dispatch(std::coroutine_handle<> h) const;
+    dispatch(continuation& c) const;
 
     /** Post work for deferred execution.
 
@@ -90,10 +90,10 @@ struct BOOST_CAPY_DECL blocking_executor
         queue. The handle is resumed when the blocking event
         loop processes it.
 
-        @param h The coroutine handle to enqueue.
+        @param c The continuation to enqueue.
     */
     void
-    post(std::coroutine_handle<> h) const;
+    post(continuation& c) const;
 
 private:
     blocking_context* ctx_;
diff --git a/include/boost/capy/test/stream.hpp b/include/boost/capy/test/stream.hpp
index 660a7bb02..32078e678 100644
--- a/include/boost/capy/test/stream.hpp
+++ b/include/boost/capy/test/stream.hpp
@@ -14,6 +14,7 @@
 #include <boost/capy/buffers.hpp>
 #include <boost/capy/buffers/buffer_copy.hpp>
 #include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/continuation.hpp>
 #include <coroutine>
 #include <boost/capy/ex/io_env.hpp>
 #include <boost/capy/io_result.hpp>
@@ -86,7 +87,7 @@ class stream
     {
         std::string buf;
         std::size_t max_read_size = std::size_t(-1);
-        std::coroutine_handle<> pending_h{};
+        continuation pending_cont_;
         executor_ref pending_ex;
         bool eof = false;
     };
@@ -109,13 +110,11 @@ class stream
             closed = true;
             for(auto& side : sides)
             {
-                if(side.pending_h)
+                if(side.pending_cont_.h)
                 {
-                    auto h = side.pending_h;
-                    side.pending_h = {};
-                    auto ex = side.pending_ex;
+                    side.pending_ex.post(side.pending_cont_);
+                    side.pending_cont_.h = {};
                     side.pending_ex = {};
-                    ex.post(h);
                 }
             }
         }
@@ -167,13 +166,11 @@ class stream
         int peer = 1 - index_;
         auto& side = state_->sides[peer];
         side.eof = true;
-        if(side.pending_h)
+        if(side.pending_cont_.h)
         {
-            auto h = side.pending_h;
-            side.pending_h = {};
-            auto ex = side.pending_ex;
+            side.pending_ex.post(side.pending_cont_);
+            side.pending_cont_.h = {};
             side.pending_ex = {};
-            ex.post(h);
         }
     }
 
@@ -205,7 +202,7 @@ class stream
 
         @param buffers The mutable buffer sequence to receive data.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse, close
     */
@@ -234,7 +231,7 @@ class stream
             {
                 auto& side = self_->state_->sides[
                     self_->index_];
-                side.pending_h = h;
+                side.pending_cont_.h = h;
                 side.pending_ex = env->executor;
                 return std::noop_coroutine();
             }
@@ -288,7 +285,7 @@ class stream
         @param buffers The const buffer sequence containing
             data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse, close
     */
@@ -336,13 +333,11 @@ class stream
                     side.buf.data() + old_size, n),
                     buffers_, n);
 
-                if(side.pending_h)
+                if(side.pending_cont_.h)
                 {
-                    auto h = side.pending_h;
-                    side.pending_h = {};
-                    auto ex = side.pending_ex;
+                    side.pending_ex.post(side.pending_cont_);
+                    side.pending_cont_.h = {};
                     side.pending_ex = {};
-                    ex.post(h);
                 }
 
                 return {{}, n};
@@ -368,13 +363,11 @@ class stream
         int peer = 1 - index_;
         auto& side = state_->sides[peer];
         side.buf.append(sv);
-        if(side.pending_h)
+        if(side.pending_cont_.h)
         {
-            auto h = side.pending_h;
-            side.pending_h = {};
-            auto ex = side.pending_ex;
+            side.pending_ex.post(side.pending_cont_);
+            side.pending_cont_.h = {};
             side.pending_ex = {};
-            ex.post(h);
         }
     }
 
diff --git a/include/boost/capy/test/write_sink.hpp b/include/boost/capy/test/write_sink.hpp
index 4bc4f6097..319342692 100644
--- a/include/boost/capy/test/write_sink.hpp
+++ b/include/boost/capy/test/write_sink.hpp
@@ -155,7 +155,7 @@ class write_sink
 
         @param buffers The const buffer sequence containing data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse
     */
@@ -216,7 +216,7 @@ class write_sink
 
         @param buffers The const buffer sequence containing data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse
     */
@@ -281,7 +281,7 @@ class write_sink
 
         @param buffers The const buffer sequence containing data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse
     */
@@ -343,7 +343,7 @@ class write_sink
         @par Exception Safety
         No-throw guarantee.
 
-        @return An awaitable yielding `(error_code)`.
+        @return An awaitable that await-returns `(error_code)`.
 
         @see fuse
     */
diff --git a/include/boost/capy/test/write_stream.hpp b/include/boost/capy/test/write_stream.hpp
index e6f08b402..6e36abd3a 100644
--- a/include/boost/capy/test/write_stream.hpp
+++ b/include/boost/capy/test/write_stream.hpp
@@ -144,7 +144,7 @@ class write_stream
 
         @param buffers The const buffer sequence containing data to write.
 
-        @return An awaitable yielding `(error_code,std::size_t)`.
+        @return An awaitable that await-returns `(error_code,std::size_t)`.
 
         @see fuse
     */
diff --git a/include/boost/capy/timeout.hpp b/include/boost/capy/timeout.hpp
new file mode 100644
index 000000000..d6132bf07
--- /dev/null
+++ b/include/boost/capy/timeout.hpp
@@ -0,0 +1,232 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_TIMEOUT_HPP
+#define BOOST_CAPY_TIMEOUT_HPP
+
+#include <boost/capy/detail/config.hpp>
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/delay.hpp>
+#include <boost/capy/detail/io_result_combinators.hpp>
+#include <boost/capy/error.hpp>
+#include <boost/capy/io_result.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/when_all.hpp>
+
+#include <atomic>
+#include <chrono>
+#include <exception>
+#include <optional>
+
+namespace boost {
+namespace capy {
+namespace detail {
+
+template<typename T>
+struct timeout_state
+{
+    when_all_core core_;
+    std::atomic<int> winner_{-1}; // -1=none, 0=inner, 1=delay
+    std::optional<T> inner_result_;
+    std::exception_ptr inner_exception_;
+    std::array<continuation, 2> runner_handles_{};
+
+    timeout_state()
+        : core_(2)
+    {
+    }
+};
+
+template<IoAwaitable Awaitable, typename T>
+when_all_runner<timeout_state<T>>
+make_timeout_inner_runner(
+    Awaitable inner, timeout_state<T>* state)
+{
+    try
+    {
+        auto result = co_await std::move(inner);
+        state->inner_result_.emplace(std::move(result));
+    }
+    catch(...)
+    {
+        state->inner_exception_ = std::current_exception();
+    }
+
+    int expected = -1;
+    if(state->winner_.compare_exchange_strong(
+        expected, 0, std::memory_order_relaxed))
+        state->core_.stop_source_.request_stop();
+}
+
+template<typename DelayAw, typename T>
+when_all_runner<timeout_state<T>>
+make_timeout_delay_runner(
+    DelayAw d, timeout_state<T>* state)
+{
+    auto result = co_await std::move(d);
+
+    if(!result.ec)
+    {
+        int expected = -1;
+        if(state->winner_.compare_exchange_strong(
+            expected, 1, std::memory_order_relaxed))
+            state->core_.stop_source_.request_stop();
+    }
+}
+
+template<IoAwaitable Inner, typename DelayAw, typename T>
+class timeout_launcher
+{
+    Inner* inner_;
+    DelayAw* delay_;
+    timeout_state<T>* state_;
+
+public:
+    timeout_launcher(
+        Inner* inner, DelayAw* delay,
+        timeout_state<T>* state)
+        : inner_(inner)
+        , delay_(delay)
+        , state_(state)
+    {
+    }
+
+    bool await_ready() const noexcept { return false; }
+
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> continuation,
+        io_env const* caller_env)
+    {
+        state_->core_.continuation_.h = continuation;
+        state_->core_.caller_env_ = caller_env;
+
+        if(caller_env->stop_token.stop_possible())
+        {
+            state_->core_.parent_stop_callback_.emplace(
+                caller_env->stop_token,
+                when_all_core::stop_callback_fn{
+                    &state_->core_.stop_source_});
+
+            if(caller_env->stop_token.stop_requested())
+                state_->core_.stop_source_.request_stop();
+        }
+
+        auto token = state_->core_.stop_source_.get_token();
+
+        auto r0 = make_timeout_inner_runner(
+            std::move(*inner_), state_);
+        auto h0 = r0.release();
+        h0.promise().state_ = state_;
+        h0.promise().env_ = io_env{
+            caller_env->executor, token,
+            caller_env->frame_allocator};
+        state_->runner_handles_[0].h =
+            std::coroutine_handle<>{h0};
+
+        auto r1 = make_timeout_delay_runner(
+            std::move(*delay_), state_);
+        auto h1 = r1.release();
+        h1.promise().state_ = state_;
+        h1.promise().env_ = io_env{
+            caller_env->executor, token,
+            caller_env->frame_allocator};
+        state_->runner_handles_[1].h =
+            std::coroutine_handle<>{h1};
+
+        caller_env->executor.post(
+            state_->runner_handles_[0]);
+        caller_env->executor.post(
+            state_->runner_handles_[1]);
+
+        return std::noop_coroutine();
+    }
+
+    void await_resume() const noexcept {}
+};
+
+} // namespace detail
+
+/** Race an io_result-returning awaitable against a deadline.
+
+    Starts the awaitable and a timer concurrently. The first to
+    complete wins and cancels the other. If the awaitable finishes
+    first, its result is returned as-is (success, error, or
+    exception). If the timer fires first, an `io_result` with
+    `ec == error::timeout` is produced.
+
+    Unlike @ref when_any, exceptions from the inner awaitable
+    are always propagated — they are never swallowed by the timer.
+
+    @par Return Type
+
+    Always returns `io_result<Ts...>` matching the inner
+    awaitable's result type. On timeout, `ec` is set to
+    `error::timeout` and payload values are default-initialized.
+
+    @par Precision
+
+    The timeout fires at or after the specified duration.
+
+    @par Cancellation
+
+    If the parent's stop token is activated, both children are
+    cancelled. The inner awaitable's cancellation result is
+    returned.
+
+    @par Example
+    @code
+    auto [ec, n] = co_await timeout(sock.read_some(buf), 50ms);
+    if (ec == cond::timeout) {
+        // handle timeout
+    }
+    @endcode
+
+    @tparam A An IoAwaitable returning `io_result<Ts...>`.
+
+    @param a The awaitable to race against the deadline.
+    @param dur The maximum duration to wait.
+
+    @return `task<awaitable_result_t<A>>`.
+
+    @throws Rethrows any exception from the inner awaitable,
+        regardless of whether the timer has fired.
+
+    @see delay, cond::timeout
+*/
+template<IoAwaitable A, typename Rep, typename Period>
+    requires detail::is_io_result_v<awaitable_result_t<A>>
+auto timeout(A a, std::chrono::duration<Rep, Period> dur)
+    -> task<awaitable_result_t<A>>
+{
+    using T = awaitable_result_t<A>;
+
+    auto d = delay(dur);
+    detail::timeout_state<T> state;
+
+    co_await detail::timeout_launcher<
+        A, decltype(d), T>(&a, &d, &state);
+
+    if(state.core_.first_exception_)
+        std::rethrow_exception(state.core_.first_exception_);
+    if(state.inner_exception_)
+        std::rethrow_exception(state.inner_exception_);
+
+    if(state.winner_.load(std::memory_order_relaxed) == 0)
+        co_return std::move(*state.inner_result_);
+
+    // Delay fired first: timeout
+    T r{};
+    r.ec = make_error_code(error::timeout);
+    co_return r;
+}
+
+} // capy
+} // boost
+
+#endif
diff --git a/include/boost/capy/when_all.hpp b/include/boost/capy/when_all.hpp
index bd539f23c..a613d0bc9 100644
--- a/include/boost/capy/when_all.hpp
+++ b/include/boost/capy/when_all.hpp
@@ -11,9 +11,12 @@
 #define BOOST_CAPY_WHEN_ALL_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/detail/io_result_combinators.hpp>
+#include <boost/capy/continuation.hpp>
 #include <boost/capy/concept/executor.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <coroutine>
+#include <boost/capy/ex/frame_alloc_mixin.hpp>
 #include <boost/capy/ex/io_env.hpp>
 #include <boost/capy/ex/frame_allocator.hpp>
 #include <boost/capy/task.hpp>
@@ -21,30 +24,21 @@
 #include <array>
 #include <atomic>
 #include <exception>
+#include <memory>
 #include <optional>
+#include <ranges>
+#include <stdexcept>
 #include <stop_token>
 #include <tuple>
 #include <type_traits>
 #include <utility>
+#include <vector>
 
 namespace boost {
 namespace capy {
 
 namespace detail {
 
-/** Type trait to filter void types from a tuple.
-
-    Void-returning tasks do not contribute a value to the result tuple.
-    This trait computes the filtered result type.
-
-    Example: filter_void_tuple_t<int, void, string> = tuple<int, string>
-*/
-template<typename T>
-using wrap_non_void_t = std::conditional_t<std::is_void_v<T>, std::tuple<>, std::tuple<T>>;
-
-template<typename... Ts>
-using filter_void_tuple_t = decltype(std::tuple_cat(std::declval<wrap_non_void_t<Ts>>()...));
-
 /** Holds the result of a single task within when_all.
 */
 template<typename T>
@@ -63,39 +57,26 @@ struct result_holder
     }
 };
 
-/** Specialization for void tasks - no value storage needed.
-*/
-template<>
-struct result_holder<void>
-{
-};
+/** Core shared state for when_all operations.
 
-/** Shared state for when_all operation.
+    Contains all members and methods common to both heterogeneous (variadic)
+    and homogeneous (range) when_all implementations. State classes embed
+    this via composition to avoid CRTP destructor ordering issues.
 
-    @tparam Ts The result types of the tasks.
+    @par Thread Safety
+    Atomic operations protect exception capture and completion count.
 */
-template<typename... Ts>
-struct when_all_state
+struct when_all_core
 {
-    static constexpr std::size_t task_count = sizeof...(Ts);
-
-    // Completion tracking - when_all waits for all children
     std::atomic<std::size_t> remaining_count_;
 
-    // Result storage in input order
-    std::tuple<result_holder<Ts>...> results_;
-
-    // Runner handles - destroyed in await_resume while allocator is valid
-    std::array<std::coroutine_handle<>, task_count> runner_handles_{};
-
     // Exception storage - first error wins, others discarded
     std::atomic<bool> has_exception_{false};
     std::exception_ptr first_exception_;
 
-    // Stop propagation - on error, request stop for siblings
     std::stop_source stop_source_;
 
-    // Connects parent's stop_token to our stop_source
+    // Bridges parent's stop token to our stop_source
     struct stop_callback_fn
     {
         std::stop_source* source_;
@@ -104,19 +85,15 @@ struct when_all_state
     using stop_callback_t = std::stop_callback<stop_callback_fn>;
     std::optional<stop_callback_t> parent_stop_callback_;
 
-    // Parent resumption
-    std::coroutine_handle<> continuation_;
+    continuation continuation_;
     io_env const* caller_env_ = nullptr;
 
-    when_all_state()
-        : remaining_count_(task_count)
+    explicit when_all_core(std::size_t count) noexcept
+        : remaining_count_(count)
     {
     }
 
-    // Runners self-destruct in final_suspend. No destruction needed here.
-
-    /** Capture an exception (first one wins).
-    */
+    /** Capture an exception (first one wins). */
     void capture_exception(std::exception_ptr ep)
     {
         bool expected = false;
@@ -124,25 +101,126 @@ struct when_all_state
             expected, true, std::memory_order_relaxed))
             first_exception_ = ep;
     }
+};
+
+/** Shared state for heterogeneous when_all (variadic overload).
+
+    @tparam Ts The result types of the tasks.
+*/
+template<typename... Ts>
+struct when_all_state
+{
+    static constexpr std::size_t task_count = sizeof...(Ts);
+
+    when_all_core core_;
+    std::tuple<result_holder<Ts>...> results_;
+    std::array<continuation, task_count> runner_handles_{};
+
+    std::atomic<bool> has_error_{false};
+    std::error_code first_error_;
+
+    when_all_state()
+        : core_(task_count)
+    {
+    }
+
+    /** Record the first error (subsequent errors are discarded). */
+    void record_error(std::error_code ec)
+    {
+        bool expected = false;
+        if(has_error_.compare_exchange_strong(
+            expected, true, std::memory_order_relaxed))
+            first_error_ = ec;
+    }
+};
+
+/** Shared state for homogeneous when_all (range overload).
+
+    Stores extracted io_result payloads in a vector indexed by task
+    position. Tracks the first error_code for error propagation.
+
+    @tparam T The payload type extracted from io_result.
+*/
+template<typename T>
+struct when_all_homogeneous_state
+{
+    when_all_core core_;
+    std::vector<std::optional<T>> results_;
+    std::unique_ptr<continuation[]> runner_handles_;
+
+    std::atomic<bool> has_error_{false};
+    std::error_code first_error_;
+
+    explicit when_all_homogeneous_state(std::size_t count)
+        : core_(count)
+        , results_(count)
+        , runner_handles_(std::make_unique<continuation[]>(count))
+    {
+    }
 
+    void set_result(std::size_t index, T value)
+    {
+        results_[index].emplace(std::move(value));
+    }
+
+    /** Record the first error (subsequent errors are discarded). */
+    void record_error(std::error_code ec)
+    {
+        bool expected = false;
+        if(has_error_.compare_exchange_strong(
+            expected, true, std::memory_order_relaxed))
+            first_error_ = ec;
+    }
+};
+
+/** Specialization for void io_result children (no payload storage). */
+template<>
+struct when_all_homogeneous_state<std::tuple<>>
+{
+    when_all_core core_;
+    std::unique_ptr<continuation[]> runner_handles_;
+
+    std::atomic<bool> has_error_{false};
+    std::error_code first_error_;
+
+    explicit when_all_homogeneous_state(std::size_t count)
+        : core_(count)
+        , runner_handles_(std::make_unique<continuation[]>(count))
+    {
+    }
+
+    /** Record the first error (subsequent errors are discarded). */
+    void record_error(std::error_code ec)
+    {
+        bool expected = false;
+        if(has_error_.compare_exchange_strong(
+            expected, true, std::memory_order_relaxed))
+            first_error_ = ec;
+    }
 };
 
-/** Wrapper coroutine that intercepts task completion.
+/** Wrapper coroutine that intercepts task completion for when_all.
 
-    This runner awaits its assigned task and stores the result in
-    the shared state, or captures the exception and requests stop.
+    Parameterized on StateType to work with both heterogeneous (variadic)
+    and homogeneous (range) state types. All state types expose their
+    shared members through a `core_` member of type when_all_core.
+
+    @tparam StateType The state type (when_all_state or when_all_homogeneous_state).
 */
-template<typename T, typename... Ts>
-struct when_all_runner
+template<typename StateType>
+struct BOOST_CAPY_CORO_DESTROY_WHEN_COMPLETE when_all_runner
 {
-    struct promise_type // : frame_allocating_base  // DISABLED FOR TESTING
+    struct promise_type
+        : frame_alloc_mixin
     {
-        when_all_state<Ts...>* state_ = nullptr;
+        StateType* state_ = nullptr;
+        std::size_t index_ = 0;
         io_env env_;
 
-        when_all_runner get_return_object()
+        when_all_runner get_return_object() noexcept
         {
-            return when_all_runner(std::coroutine_handle<promise_type>::from_promise(*this));
+            return when_all_runner(
+                std::coroutine_handle<promise_type>::from_promise(*this));
         }
 
         std::suspend_always initial_suspend() noexcept
@@ -155,45 +233,32 @@ struct when_all_runner
             struct awaiter
             {
                 promise_type* p_;
-
-                bool await_ready() const noexcept
-                {
-                    return false;
-                }
-
-                std::coroutine_handle<> await_suspend(std::coroutine_handle<> h) noexcept
+                bool await_ready() const noexcept { return false; }
+                auto await_suspend(std::coroutine_handle<> h) noexcept
                 {
-                    // Extract everything needed before self-destruction.
-                    auto* state = p_->state_;
-                    auto* counter = &state->remaining_count_;
-                    auto* caller_env = state->caller_env_;
-                    auto cont = state->continuation_;
+                    auto& core = p_->state_->core_;
+                    auto* counter = &core.remaining_count_;
+                    auto* caller_env = core.caller_env_;
+                    auto& cont = core.continuation_;
 
                     h.destroy();
 
-                    // If last runner, dispatch parent for symmetric transfer.
                     auto remaining = counter->fetch_sub(1, std::memory_order_acq_rel);
                     if(remaining == 1)
-                        return caller_env->executor.dispatch(cont);
-                    return std::noop_coroutine();
-                }
-
-                void await_resume() const noexcept
-                {
+                        return detail::symmetric_transfer(caller_env->executor.dispatch(cont));
+                    return detail::symmetric_transfer(std::noop_coroutine());
                 }
+                void await_resume() const noexcept {}
             };
             return awaiter{this};
         }
 
-        void return_void()
-        {
-        }
+        void return_void() noexcept {}
 
-        void unhandled_exception()
+        void unhandled_exception() noexcept
         {
-            state_->capture_exception(std::current_exception());
-            // Request stop for sibling tasks
-            state_->stop_source_.request_stop();
+            state_->core_.capture_exception(std::current_exception());
+            state_->core_.stop_source_.request_stop();
         }
 
         template<class Awaitable>
@@ -202,20 +267,17 @@ struct when_all_runner
             std::decay_t<Awaitable> a_;
             promise_type* p_;
 
-            bool await_ready()
-            {
-                return a_.await_ready();
-            }
-
-            decltype(auto) await_resume()
-            {
-                return a_.await_resume();
-            }
+            bool await_ready() { return a_.await_ready(); }
+            decltype(auto) await_resume() { return a_.await_resume(); }
 
             template<class Promise>
             auto await_suspend(std::coroutine_handle<Promise> h)
             {
-                return a_.await_suspend(h, &p_->env_);
+                using R = decltype(a_.await_suspend(h, &p_->env_));
+                if constexpr (std::is_same_v<R, std::coroutine_handle<>>)
+                    return detail::symmetric_transfer(a_.await_suspend(h, &p_->env_));
+                else
+                    return a_.await_suspend(h, &p_->env_);
             }
         };
 
@@ -237,15 +299,17 @@ struct when_all_runner
 
     std::coroutine_handle<promise_type> h_;
 
-    explicit when_all_runner(std::coroutine_handle<promise_type> h)
+    explicit when_all_runner(std::coroutine_handle<promise_type> h) noexcept
         : h_(h)
     {
     }
 
     // Enable move for all clang versions - some versions need it
-    when_all_runner(when_all_runner&& other) noexcept : h_(std::exchange(other.h_, nullptr)) {}
+    when_all_runner(when_all_runner&& other) noexcept
+        : h_(std::exchange(other.h_, nullptr))
+    {
+    }
 
-    // Non-copyable
     when_all_runner(when_all_runner const&) = delete;
     when_all_runner& operator=(when_all_runner const&) = delete;
     when_all_runner& operator=(when_all_runner&&) = delete;
@@ -256,32 +320,53 @@ struct when_all_runner
     }
 };
 
-/** Create a runner coroutine for a single awaitable.
+/** Create an io_result-aware runner for a single awaitable (range path).
 
-    Awaitable is passed directly to ensure proper coroutine frame storage.
+    Checks the error code, records errors and requests stop on failure,
+    or extracts the payload on success.
 */
-template<std::size_t Index, IoAwaitable Awaitable, typename... Ts>
-when_all_runner<awaitable_result_t<Awaitable>, Ts...>
-make_when_all_runner(Awaitable inner, when_all_state<Ts...>* state)
+template<IoAwaitable Awaitable, typename StateType>
+when_all_runner<StateType>
+make_when_all_homogeneous_runner(Awaitable inner, StateType* state, std::size_t index)
 {
-    using T = awaitable_result_t<Awaitable>;
-    if constexpr (std::is_void_v<T>)
+    auto result = co_await std::move(inner);
+
+    if(result.ec)
     {
-        co_await std::move(inner);
+        state->record_error(result.ec);
+        state->core_.stop_source_.request_stop();
     }
     else
     {
-        std::get<Index>(state->results_).set(co_await std::move(inner));
+        using PayloadT = io_result_payload_t<
+            awaitable_result_t<Awaitable>>;
+        if constexpr (!std::is_same_v<PayloadT, std::tuple<>>)
+        {
+            state->set_result(index,
+                extract_io_payload(std::move(result)));
+        }
     }
 }
 
-/** Internal awaitable that launches all runner coroutines and waits.
+/** Create a runner for io_result children that requests stop on ec. */
+template<std::size_t Index, IoAwaitable Awaitable, typename... Ts>
+when_all_runner<when_all_state<Ts...>>
+make_when_all_io_runner(Awaitable inner, when_all_state<Ts...>* state)
+{
+    auto result = co_await std::move(inner);
+    auto ec = result.ec;
+    std::get<Index>(state->results_).set(std::move(result));
 
-    This awaitable is used inside the when_all coroutine to handle
-    the concurrent execution of child awaitables.
-*/
+    if(ec)
+    {
+        state->record_error(ec);
+        state->core_.stop_source_.request_stop();
+    }
+}
+
+/** Launcher that uses io_result-aware runners. */
 template<IoAwaitable... Awaitables>
-class when_all_launcher
+class when_all_io_launcher
 {
     using state_type = when_all_state<awaitable_result_t<Awaitables>...>;
 
@@ -289,7 +374,7 @@ class when_all_launcher
     state_type* state_;
 
 public:
-    when_all_launcher(
+    when_all_io_launcher(
         std::tuple<Awaitables...>* awaitables,
         state_type* state)
         : awaitables_(awaitables)
@@ -302,108 +387,158 @@ class when_all_launcher
         return sizeof...(Awaitables) == 0;
     }
 
-    std::coroutine_handle<> await_suspend(std::coroutine_handle<> continuation, io_env const* caller_env)
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> continuation, io_env const* caller_env)
     {
-        state_->continuation_ = continuation;
-        state_->caller_env_ = caller_env;
+        state_->core_.continuation_.h = continuation;
+        state_->core_.caller_env_ = caller_env;
 
-        // Forward parent's stop requests to children
         if(caller_env->stop_token.stop_possible())
         {
-            state_->parent_stop_callback_.emplace(
+            state_->core_.parent_stop_callback_.emplace(
                 caller_env->stop_token,
-                typename state_type::stop_callback_fn{&state_->stop_source_});
+                when_all_core::stop_callback_fn{&state_->core_.stop_source_});
 
             if(caller_env->stop_token.stop_requested())
-                state_->stop_source_.request_stop();
+                state_->core_.stop_source_.request_stop();
         }
 
-        // CRITICAL: If the last task finishes synchronously then the parent
-        // coroutine resumes, destroying its frame, and destroying this object
-        // prior to the completion of await_suspend. Therefore, await_suspend
-        // must ensure `this` cannot be referenced after calling `launch_one`
-        // for the last time.
-        auto token = state_->stop_source_.get_token();
+        auto token = state_->core_.stop_source_.get_token();
         [&]<std::size_t... Is>(std::index_sequence<Is...>) {
             (..., launch_one<Is>(caller_env->executor, token));
         }(std::index_sequence_for<Awaitables...>{});
 
-        // Let signal_completion() handle resumption
         return std::noop_coroutine();
     }
 
-    void await_resume() const noexcept
-    {
-        // Results are extracted by the when_all coroutine from state
-    }
+    void await_resume() const noexcept {}
 
 private:
     template<std::size_t I>
     void launch_one(executor_ref caller_ex, std::stop_token token)
     {
-        auto runner = make_when_all_runner<I>(
+        auto runner = make_when_all_io_runner<I>(
             std::move(std::get<I>(*awaitables_)), state_);
 
         auto h = runner.release();
         h.promise().state_ = state_;
-        h.promise().env_ = io_env{caller_ex, token, state_->caller_env_->frame_allocator};
+        h.promise().env_ = io_env{caller_ex, token,
+            state_->core_.caller_env_->frame_allocator};
 
-        std::coroutine_handle<> ch{h};
-        state_->runner_handles_[I] = ch;
-        state_->caller_env_->executor.post(ch);
+        state_->runner_handles_[I].h = std::coroutine_handle<>{h};
+        state_->core_.caller_env_->executor.post(state_->runner_handles_[I]);
     }
 };
 
-/** Compute the result type for when_all.
-
-    Returns void when all tasks are void (P2300 aligned),
-    otherwise returns a tuple with void types filtered out.
-*/
-template<typename... Ts>
-using when_all_result_t = std::conditional_t<
-    std::is_same_v<filter_void_tuple_t<Ts...>, std::tuple<>>,
-    void,
-    filter_void_tuple_t<Ts...>>;
-
-/** Helper to extract a single result, returning empty tuple for void.
+/** Helper to extract a single result from state.
     This is a separate function to work around a GCC-11 ICE that occurs
     when using nested immediately-invoked lambdas with pack expansion.
 */
 template<std::size_t I, typename... Ts>
 auto extract_single_result(when_all_state<Ts...>& state)
 {
-    using T = std::tuple_element_t<I, std::tuple<Ts...>>;
-    if constexpr (std::is_void_v<T>)
-        return std::tuple<>();
-    else
-        return std::make_tuple(std::move(std::get<I>(state.results_)).get());
+    return std::move(std::get<I>(state.results_)).get();
 }
 
-/** Extract results from state, filtering void types.
+/** Extract all results from state as a tuple.
 */
 template<typename... Ts>
 auto extract_results(when_all_state<Ts...>& state)
 {
     return [&]<std::size_t... Is>(std::index_sequence<Is...>) {
-        return std::tuple_cat(extract_single_result<Is>(state)...);
+        return std::tuple(extract_single_result<Is>(state)...);
     }(std::index_sequence_for<Ts...>{});
 }
 
+/** Launches all homogeneous runners concurrently.
+
+    Two-phase approach: create all runners first, then post all.
+    This avoids lifetime issues if a task completes synchronously.
+*/
+template<typename Range>
+class when_all_homogeneous_launcher
+{
+    using Awaitable = std::ranges::range_value_t<Range>;
+    using PayloadT = io_result_payload_t<awaitable_result_t<Awaitable>>;
+
+    Range* range_;
+    when_all_homogeneous_state<PayloadT>* state_;
+
+public:
+    when_all_homogeneous_launcher(
+        Range* range,
+        when_all_homogeneous_state<PayloadT>* state)
+        : range_(range)
+        , state_(state)
+    {
+    }
+
+    bool await_ready() const noexcept
+    {
+        return std::ranges::empty(*range_);
+    }
+
+    std::coroutine_handle<> await_suspend(std::coroutine_handle<> continuation, io_env const* caller_env)
+    {
+        state_->core_.continuation_.h = continuation;
+        state_->core_.caller_env_ = caller_env;
+
+        if(caller_env->stop_token.stop_possible())
+        {
+            state_->core_.parent_stop_callback_.emplace(
+                caller_env->stop_token,
+                when_all_core::stop_callback_fn{&state_->core_.stop_source_});
+
+            if(caller_env->stop_token.stop_requested())
+                state_->core_.stop_source_.request_stop();
+        }
+
+        auto token = state_->core_.stop_source_.get_token();
+
+        // Phase 1: Create all runners without dispatching.
+        std::size_t index = 0;
+        for(auto&& a : *range_)
+        {
+            auto runner = make_when_all_homogeneous_runner(
+                std::move(a), state_, index);
+
+            auto h = runner.release();
+            h.promise().state_ = state_;
+            h.promise().index_ = index;
+            h.promise().env_ = io_env{caller_env->executor, token, caller_env->frame_allocator};
+
+            state_->runner_handles_[index].h = std::coroutine_handle<>{h};
+            ++index;
+        }
+
+        // Phase 2: Post all runners. Any may complete synchronously.
+        // After last post, state_ and this may be destroyed.
+        auto* handles = state_->runner_handles_.get();
+        std::size_t count = state_->core_.remaining_count_.load(std::memory_order_relaxed);
+        for(std::size_t i = 0; i < count; ++i)
+            caller_env->executor.post(handles[i]);
+
+        return std::noop_coroutine();
+    }
+
+    void await_resume() const noexcept
+    {
+    }
+};
+
 } // namespace detail
 
-/** Execute multiple awaitables concurrently and collect their results.
+/** Execute a range of io_result-returning awaitables concurrently.
 
-    Launches all awaitables simultaneously and waits for all to complete
-    before returning. Results are collected in input order. If any
-    awaitable throws, cancellation is requested for siblings and the first
-    exception is rethrown after all awaitables complete.
+    Launches all awaitables simultaneously and waits for all to complete.
+    On success, extracted payloads are collected in a vector preserving
+    input order. The first error_code cancels siblings and is propagated
+    in the outer io_result. Exceptions always beat error codes.
 
     @li All child awaitables run concurrently on the caller's executor
-    @li Results are returned as a tuple in input order
-    @li Void-returning awaitables do not contribute to the result tuple
-    @li If all awaitables return void, `when_all` returns `task<void>`
-    @li First exception wins; subsequent exceptions are discarded
-    @li Stop is requested for siblings on first error
+    @li Payloads are returned as a vector in input order
+    @li First error_code wins and cancels siblings
+    @li Exception always beats error_code
     @li Completes only after all children have finished
 
     @par Thread Safety
@@ -411,68 +546,180 @@ auto extract_results(when_all_state<Ts...>& state)
     Child awaitables execute concurrently but complete through the caller's
     executor.
 
-    @param awaitables The awaitables to execute concurrently. Each must
-        satisfy @ref IoAwaitable and is consumed (moved-from) when
-        `when_all` is awaited.
+    @param awaitables Range of io_result-returning awaitables to execute
+        concurrently (must not be empty).
 
-    @return A task yielding a tuple of non-void results. Returns
-        `task<void>` when all input awaitables return void.
+    @return A task yielding io_result<vector<PayloadT>> where PayloadT
+        is the payload extracted from each child's io_result.
+
+    @throws std::invalid_argument if range is empty (thrown before
+        coroutine suspends).
+    @throws Rethrows the first child exception after all children
+        complete (exception beats error_code).
 
     @par Example
+    @code
+    task<void> example()
+    {
+        std::vector<io_task<size_t>> reads;
+        for (auto& buf : buffers)
+            reads.push_back(stream.read_some(buf));
+
+        auto [ec, counts] = co_await when_all(std::move(reads));
+        if (ec) { // handle error
+        }
+    }
+    @endcode
+
+    @see IoAwaitableRange, when_all
+*/
+template<IoAwaitableRange R>
+    requires detail::is_io_result_v<
+        awaitable_result_t<std::ranges::range_value_t<R>>>
+    && (!std::is_same_v<
+            detail::io_result_payload_t<
+                awaitable_result_t<std::ranges::range_value_t<R>>>,
+            std::tuple<>>)
+[[nodiscard]] auto when_all(R&& awaitables)
+    -> task<io_result<std::vector<
+        detail::io_result_payload_t<
+            awaitable_result_t<std::ranges::range_value_t<R>>>>>>
+{
+    using Awaitable = std::ranges::range_value_t<R>;
+    using PayloadT = detail::io_result_payload_t<
+        awaitable_result_t<Awaitable>>;
+    using OwnedRange = std::remove_cvref_t<R>;
+
+    auto count = std::ranges::size(awaitables);
+    if(count == 0)
+        throw std::invalid_argument("when_all requires at least one awaitable");
+
+    OwnedRange owned_awaitables = std::forward<R>(awaitables);
+
+    detail::when_all_homogeneous_state<PayloadT> state(count);
+
+    co_await detail::when_all_homogeneous_launcher<OwnedRange>(
+        &owned_awaitables, &state);
+
+    if(state.core_.first_exception_)
+        std::rethrow_exception(state.core_.first_exception_);
+
+    if(state.has_error_.load(std::memory_order_relaxed))
+        co_return io_result<std::vector<PayloadT>>{state.first_error_, {}};
+
+    std::vector<PayloadT> results;
+    results.reserve(count);
+    for(auto& opt : state.results_)
+        results.push_back(std::move(*opt));
+
+    co_return io_result<std::vector<PayloadT>>{{}, std::move(results)};
+}
+
+/** Execute a range of void io_result-returning awaitables concurrently.
+
+    Launches all awaitables simultaneously and waits for all to complete.
+    Since all awaitables return io_result<>, no payload values are
+    collected. The first error_code cancels siblings and is propagated.
+    Exceptions always beat error codes.
 
+    @param awaitables Range of io_result<>-returning awaitables to
+        execute concurrently (must not be empty).
+
+    @return A task yielding io_result<> whose ec is the first child
+        error, or default-constructed on success.
+
+    @throws std::invalid_argument if range is empty.
+    @throws Rethrows the first child exception after all children
+        complete (exception beats error_code).
+
+    @par Example
     @code
-    task<> example()
+    task<void> example()
     {
-        // Concurrent fetch, results collected in order
-        auto [user, posts] = co_await when_all(
-            fetch_user( id ),      // task<User>
-            fetch_posts( id )      // task<std::vector<Post>>
-        );
-
-        // Void awaitables don't contribute to result
-        co_await when_all(
-            log_event( "start" ),  // task<void>
-            notify_user( id )      // task<void>
-        );
-        // Returns task<void>, no result tuple
+        std::vector<io_task<>> jobs;
+        for (int i = 0; i < n; ++i)
+            jobs.push_back(process(i));
+
+        auto [ec] = co_await when_all(std::move(jobs));
     }
     @endcode
 
-    @see IoAwaitable, task
+    @see IoAwaitableRange, when_all
+*/
+template<IoAwaitableRange R>
+    requires detail::is_io_result_v<
+        awaitable_result_t<std::ranges::range_value_t<R>>>
+    && std::is_same_v<
+            detail::io_result_payload_t<
+                awaitable_result_t<std::ranges::range_value_t<R>>>,
+            std::tuple<>>
+[[nodiscard]] auto when_all(R&& awaitables) -> task<io_result<>>
+{
+    using OwnedRange = std::remove_cvref_t<R>;
+
+    auto count = std::ranges::size(awaitables);
+    if(count == 0)
+        throw std::invalid_argument("when_all requires at least one awaitable");
+
+    OwnedRange owned_awaitables = std::forward<R>(awaitables);
+
+    detail::when_all_homogeneous_state<std::tuple<>> state(count);
+
+    co_await detail::when_all_homogeneous_launcher<OwnedRange>(
+        &owned_awaitables, &state);
+
+    if(state.core_.first_exception_)
+        std::rethrow_exception(state.core_.first_exception_);
+
+    if(state.has_error_.load(std::memory_order_relaxed))
+        co_return io_result<>{state.first_error_};
+
+    co_return io_result<>{};
+}
+
+/** Execute io_result-returning awaitables concurrently, inspecting error codes.
+
+    Overload selected when all children return io_result<Ts...>.
+    The error_code is lifted out of each child into a single outer
+    io_result. On success all values are returned; on failure the
+    first error_code wins.
+
+    @par Exception Safety
+    Exception always beats error_code. If any child throws, the
+    exception is rethrown regardless of error_code results.
+
+    @param awaitables One or more awaitables each returning
+        io_result<Ts...>.
+
+    @return A task yielding io_result<R1, R2, ..., Rn> where each Ri
+        follows the payload flattening rules.
 */
 template<IoAwaitable... As>
+    requires (sizeof...(As) > 0)
+          && detail::all_io_result_awaitables<As...>
 [[nodiscard]] auto when_all(As... awaitables)
-    -> task<detail::when_all_result_t<detail::awaitable_result_t<As>...>>
+    -> task<io_result<
+        detail::io_result_payload_t<awaitable_result_t<As>>...>>
 {
-    using result_type = detail::when_all_result_t<detail::awaitable_result_t<As>...>;
-
-    // State is stored in the coroutine frame, using the frame allocator
-    detail::when_all_state<detail::awaitable_result_t<As>...> state;
+    using result_type = io_result<
+        detail::io_result_payload_t<awaitable_result_t<As>>...>;
 
-    // Store awaitables in the frame
+    detail::when_all_state<awaitable_result_t<As>...> state;
     std::tuple<As...> awaitable_tuple(std::move(awaitables)...);
 
-    // Launch all awaitables and wait for completion
-    co_await detail::when_all_launcher<As...>(&awaitable_tuple, &state);
+    co_await detail::when_all_io_launcher<As...>(&awaitable_tuple, &state);
 
-    // Propagate first exception if any.
-    // Safe without explicit acquire: capture_exception() is sequenced-before
-    // signal_completion()'s acq_rel fetch_sub, which synchronizes-with the
-    // last task's decrement that resumes this coroutine.
-    if(state.first_exception_)
-        std::rethrow_exception(state.first_exception_);
+    // Exception always wins over error_code
+    if(state.core_.first_exception_)
+        std::rethrow_exception(state.core_.first_exception_);
 
-    // Extract and return results
-    if constexpr (std::is_void_v<result_type>)
-        co_return;
-    else
-        co_return detail::extract_results(state);
+    auto r = detail::build_when_all_io_result<result_type>(
+        detail::extract_results(state));
+    if(state.has_error_.load(std::memory_order_relaxed))
+        r.ec = state.first_error_;
+    co_return r;
 }
 
-/// Compute the result type of `when_all` for the given task types.
-template<typename... Ts>
-using when_all_result_type = detail::when_all_result_t<Ts...>;
-
 } // namespace capy
 } // namespace boost
 
diff --git a/include/boost/capy/when_any.hpp b/include/boost/capy/when_any.hpp
index 997139029..d87efc0b8 100644
--- a/include/boost/capy/when_any.hpp
+++ b/include/boost/capy/when_any.hpp
@@ -1,5 +1,6 @@
 //
 // Copyright (c) 2026 Michael Vandeberg
+// Copyright (c) 2026 Steve Gerbino
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -11,10 +12,13 @@
 #define BOOST_CAPY_WHEN_ANY_HPP
 
 #include <boost/capy/detail/config.hpp>
+#include <boost/capy/detail/io_result_combinators.hpp>
+#include <boost/capy/continuation.hpp>
 #include <boost/capy/concept/executor.hpp>
 #include <boost/capy/concept/io_awaitable.hpp>
 #include <coroutine>
 #include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/frame_alloc_mixin.hpp>
 #include <boost/capy/ex/frame_allocator.hpp>
 #include <boost/capy/ex/io_env.hpp>
 #include <boost/capy/task.hpp>
@@ -22,6 +26,8 @@
 #include <array>
 #include <atomic>
 #include <exception>
+#include <memory>
+#include <mutex>
 #include <optional>
 #include <ranges>
 #include <stdexcept>
@@ -33,14 +39,16 @@
 #include <vector>
 
 /*
-   when_any - Race multiple tasks, return first completion
-   ========================================================
+   when_any - Race multiple io_result tasks, select first success
+   =============================================================
 
    OVERVIEW:
    ---------
-   when_any launches N tasks concurrently and completes when the FIRST task
-   finishes (success or failure). It then requests stop for all siblings and
-   waits for them to acknowledge before returning.
+   when_any launches N io_result-returning tasks concurrently. A task
+   wins by returning !ec; errors and exceptions do not win. Once a
+   winner is found, stop is requested for siblings and the winner's
+   payload is returned. If no winner exists (all fail), the first
+   error_code is returned or the last exception is rethrown.
 
    ARCHITECTURE:
    -------------
@@ -51,46 +59,39 @@
                 BUT still wait for remaining_count to reach 0 for cleanup
 
    Key components:
-     - when_any_state:    Shared state tracking winner and completion
-     - when_any_runner:   Wrapper coroutine for each child task
-     - when_any_launcher: Awaitable that starts all runners concurrently
+     - when_any_core:    Shared state tracking winner and completion
+     - when_any_io_runner: Wrapper coroutine for each child task
+     - when_any_io_launcher/when_any_io_homogeneous_launcher:
+                          Awaitables that start all runners concurrently
 
    CRITICAL INVARIANTS:
    --------------------
-   1. Exactly one task becomes the winner (via atomic compare_exchange)
+   1. Only a task returning !ec can become the winner (via atomic CAS)
    2. All tasks must complete before parent resumes (cleanup safety)
    3. Stop is requested immediately when winner is determined
-   4. Only the winner's result/exception is stored
+   4. Exceptions and errors do not claim winner status
 
-   TYPE DEDUPLICATION:
+   POSITIONAL VARIANT:
    -------------------
-   std::variant requires unique alternative types. Since when_any can race
-   tasks with identical return types (e.g., three task<int>), we must
-   deduplicate types before constructing the variant.
+   The variadic overload returns std::variant<error_code, R1, R2, ..., Rn>.
+   Index 0 is error_code (failure/no-winner). Index 1..N identifies the
+   winning child and carries its payload.
 
-   Example: when_any(task<int>, task<string>, task<int>)
-     - Raw types after void->monostate: int, string, int
-     - Deduplicated variant: std::variant<int, string>
-     - Return: pair<size_t, variant<int, string>>
-
-   The winner_index tells you which task won (0, 1, or 2), while the variant
-   holds the result. Use the index to determine how to interpret the variant.
-
-   VOID HANDLING:
-   --------------
-   void tasks contribute std::monostate to the variant (then deduplicated).
-   All-void tasks result in: pair<size_t, variant<monostate>>
+   RANGE OVERLOAD:
+   ---------------
+   The range overload returns variant<error_code, pair<size_t, T>> for
+   non-void children or variant<error_code, size_t> for void children.
 
    MEMORY MODEL:
    -------------
    Synchronization chain from winner's write to parent's read:
 
-   1. Winner thread writes result_/winner_exception_ (non-atomic)
-   2. Winner thread calls signal_completion() → fetch_sub(acq_rel) on remaining_count_
+   1. Winner thread writes result_ (non-atomic)
+   2. Winner thread calls signal_completion() -> fetch_sub(acq_rel) on remaining_count_
    3. Last task thread (may be winner or non-winner) calls signal_completion()
-      → fetch_sub(acq_rel) on remaining_count_, observing count becomes 0
+      -> fetch_sub(acq_rel) on remaining_count_, observing count becomes 0
    4. Last task returns caller_ex_.dispatch(continuation_) via symmetric transfer
-   5. Parent coroutine resumes and reads result_/winner_exception_
+   5. Parent coroutine resumes and reads result_
 
    Synchronization analysis:
    - All fetch_sub operations on remaining_count_ form a release sequence
@@ -102,17 +103,12 @@
      (release-on-post, acquire-on-execute) completing the chain to parent
    - Even inline executors work (same thread = sequenced-before)
 
-   Alternative considered: Adding winner_ready_ atomic (set with release after
-   storing winner data, acquired before reading) would make synchronization
-   self-contained and not rely on executor implementation details. Current
-   approach is correct but requires careful reasoning about release sequences
-   and executor behavior.
-
    EXCEPTION SEMANTICS:
    --------------------
-   Unlike when_all (which captures first exception, discards others), when_any
-   treats exceptions as valid completions. If the winning task threw, that
-   exception is rethrown. Exceptions from non-winners are silently discarded.
+   Exceptions do NOT claim winner status. If a child throws, the exception
+   is recorded but the combinator keeps waiting for a success. Only when
+   all children complete without a winner does the combinator check: if
+   any exception was recorded, it is rethrown (exception beats error_code).
 */
 
 namespace boost {
@@ -120,58 +116,6 @@ namespace capy {
 
 namespace detail {
 
-/** Convert void to monostate for variant storage.
-
-    std::variant<void, ...> is ill-formed, so void tasks contribute
-    std::monostate to the result variant instead. Non-void types
-    pass through unchanged.
-
-    @tparam T The type to potentially convert (void becomes monostate).
-*/
-template<typename T>
-using void_to_monostate_t = std::conditional_t<std::is_void_v<T>, std::monostate, T>;
-
-// Type deduplication: std::variant requires unique alternative types.
-// Fold left over the type list, appending each type only if not already present.
-template<typename Variant, typename T>
-struct variant_append_if_unique;
-
-template<typename... Vs, typename T>
-struct variant_append_if_unique<std::variant<Vs...>, T>
-{
-    using type = std::conditional_t<
-        (std::is_same_v<T, Vs> || ...),
-        std::variant<Vs...>,
-        std::variant<Vs..., T>>;
-};
-
-template<typename Accumulated, typename... Remaining>
-struct deduplicate_impl;
-
-template<typename Accumulated>
-struct deduplicate_impl<Accumulated>
-{
-    using type = Accumulated;
-};
-
-template<typename Accumulated, typename T, typename... Rest>
-struct deduplicate_impl<Accumulated, T, Rest...>
-{
-    using next = typename variant_append_if_unique<Accumulated, T>::type;
-    using type = typename deduplicate_impl<next, Rest...>::type;
-};
-
-// Deduplicated variant; void types become monostate before deduplication
-template<typename T0, typename... Ts>
-using unique_variant_t = typename deduplicate_impl<
-    std::variant<void_to_monostate_t<T0>>,
-    void_to_monostate_t<Ts>...>::type;
-
-// Result: (winner_index, deduplicated_variant). Use index to disambiguate
-// when multiple tasks share the same return type.
-template<typename T0, typename... Ts>
-using when_any_result_t = std::pair<std::size_t, unique_variant_t<T0, Ts...>>;
-
 /** Core shared state for when_any operations.
 
     Contains all members and methods common to both heterogeneous (variadic)
@@ -197,7 +141,7 @@ struct when_any_core
     using stop_callback_t = std::stop_callback<stop_callback_fn>;
     std::optional<stop_callback_t> parent_stop_callback_;
 
-    std::coroutine_handle<> continuation_;
+    continuation continuation_;
     io_env const* caller_env_ = nullptr;
 
     // Placed last to avoid padding (1-byte atomic followed by 8-byte aligned members)
@@ -231,78 +175,66 @@ struct when_any_core
     // Runners signal completion directly via final_suspend; no member function needed.
 };
 
-/** Shared state for heterogeneous when_any operation.
-
-    Coordinates winner selection, result storage, and completion tracking
-    for all child tasks in a when_any operation. Uses composition with
-    when_any_core for shared functionality.
+} // namespace detail
 
-    @par Lifetime
-    Allocated on the parent coroutine's frame, outlives all runners.
+namespace detail {
 
-    @tparam T0 First task's result type.
-    @tparam Ts Remaining tasks' result types.
-*/
-template<typename T0, typename... Ts>
-struct when_any_state
+// State for io_result-aware when_any: only !ec wins.
+template<typename... Ts>
+struct when_any_io_state
 {
-    static constexpr std::size_t task_count = 1 + sizeof...(Ts);
-    using variant_type = unique_variant_t<T0, Ts...>;
+    static constexpr std::size_t task_count = sizeof...(Ts);
+    using variant_type = std::variant<std::error_code, Ts...>;
 
     when_any_core core_;
     std::optional<variant_type> result_;
-    std::array<std::coroutine_handle<>, task_count> runner_handles_{};
+    std::array<continuation, task_count> runner_handles_{};
+
+    // Last failure (error or exception) for the all-fail case.
+    // Last writer wins — no priority between errors and exceptions.
+    std::mutex failure_mu_;
+    std::error_code last_error_;
+    std::exception_ptr last_exception_;
 
-    when_any_state()
+    when_any_io_state()
         : core_(task_count)
     {
     }
 
-    // Runners self-destruct in final_suspend. No destruction needed here.
-
-    /** @pre core_.try_win() returned true.
-        @note Uses in_place_type (not index) because variant is deduplicated.
-    */
-    template<typename T>
-    void set_winner_result(T value)
-        noexcept(std::is_nothrow_move_constructible_v<T>)
+    void record_error(std::error_code ec)
     {
-        result_.emplace(std::in_place_type<T>, std::move(value));
+        std::lock_guard lk(failure_mu_);
+        last_error_ = ec;
+        last_exception_ = nullptr;
     }
 
-    /** @pre core_.try_win() returned true. */
-    void set_winner_void() noexcept
+    void record_exception(std::exception_ptr ep)
     {
-        result_.emplace(std::in_place_type<std::monostate>, std::monostate{});
+        std::lock_guard lk(failure_mu_);
+        last_exception_ = ep;
+        last_error_ = {};
     }
 };
 
-/** Wrapper coroutine that runs a single child task for when_any.
-
-    Propagates executor/stop_token to the child, attempts to claim winner
-    status on completion, and signals completion for cleanup coordination.
-
-    @tparam StateType The state type (when_any_state or when_any_homogeneous_state).
-*/
+// Wrapper coroutine for io_result-aware when_any children.
+// unhandled_exception records the exception but does NOT claim winner status.
 template<typename StateType>
-struct when_any_runner
+struct BOOST_CAPY_CORO_DESTROY_WHEN_COMPLETE when_any_io_runner
 {
-    struct promise_type // : frame_allocating_base  // DISABLED FOR TESTING
+    struct promise_type
+        : frame_alloc_mixin
     {
         StateType* state_ = nullptr;
         std::size_t index_ = 0;
         io_env env_;
 
-        when_any_runner get_return_object() noexcept
+        when_any_io_runner get_return_object() noexcept
         {
-            return when_any_runner(std::coroutine_handle<promise_type>::from_promise(*this));
+            return when_any_io_runner(
+                std::coroutine_handle<promise_type>::from_promise(*this));
         }
 
-        // Starts suspended; launcher sets up state/ex/token then resumes
-        std::suspend_always initial_suspend() noexcept
-        {
-            return {};
-        }
+        std::suspend_always initial_suspend() noexcept { return {}; }
 
         auto final_suspend() noexcept
         {
@@ -310,21 +242,19 @@ struct when_any_runner
             {
                 promise_type* p_;
                 bool await_ready() const noexcept { return false; }
-                std::coroutine_handle<> await_suspend(std::coroutine_handle<> h) noexcept
+                auto await_suspend(std::coroutine_handle<> h) noexcept
                 {
-                    // Extract everything needed before self-destruction.
                     auto& core = p_->state_->core_;
                     auto* counter = &core.remaining_count_;
                     auto* caller_env = core.caller_env_;
-                    auto cont = core.continuation_;
+                    auto& cont = core.continuation_;
 
                     h.destroy();
 
-                    // If last runner, dispatch parent for symmetric transfer.
                     auto remaining = counter->fetch_sub(1, std::memory_order_acq_rel);
                     if(remaining == 1)
-                        return caller_env->executor.dispatch(cont);
-                    return std::noop_coroutine();
+                        return detail::symmetric_transfer(caller_env->executor.dispatch(cont));
+                    return detail::symmetric_transfer(std::noop_coroutine());
                 }
                 void await_resume() const noexcept {}
             };
@@ -333,14 +263,12 @@ struct when_any_runner
 
         void return_void() noexcept {}
 
-        // Exceptions are valid completions in when_any (unlike when_all)
-        void unhandled_exception()
+        // Exceptions do NOT win in io_result when_any
+        void unhandled_exception() noexcept
         {
-            if(state_->core_.try_win(index_))
-                state_->core_.set_winner_exception(std::current_exception());
+            state_->record_exception(std::current_exception());
         }
 
-        /** Injects executor and stop token into child awaitables. */
         template<class Awaitable>
         struct transform_awaiter
         {
@@ -348,12 +276,16 @@ struct when_any_runner
             promise_type* p_;
 
             bool await_ready() { return a_.await_ready(); }
-            auto await_resume() { return a_.await_resume(); }
+            decltype(auto) await_resume() { return a_.await_resume(); }
 
             template<class Promise>
             auto await_suspend(std::coroutine_handle<Promise> h)
             {
-                return a_.await_suspend(h, &p_->env_);
+                using R = decltype(a_.await_suspend(h, &p_->env_));
+                if constexpr (std::is_same_v<R, std::coroutine_handle<>>)
+                    return detail::symmetric_transfer(a_.await_suspend(h, &p_->env_));
+                else
+                    return a_.await_suspend(h, &p_->env_);
             }
         };
 
@@ -375,18 +307,19 @@ struct when_any_runner
 
     std::coroutine_handle<promise_type> h_;
 
-    explicit when_any_runner(std::coroutine_handle<promise_type> h) noexcept
+    explicit when_any_io_runner(std::coroutine_handle<promise_type> h) noexcept
         : h_(h)
     {
     }
 
-    // Enable move for all clang versions - some versions need it
-    when_any_runner(when_any_runner&& other) noexcept : h_(std::exchange(other.h_, nullptr)) {}
+    when_any_io_runner(when_any_io_runner&& other) noexcept
+        : h_(std::exchange(other.h_, nullptr))
+    {
+    }
 
-    // Non-copyable
-    when_any_runner(when_any_runner const&) = delete;
-    when_any_runner& operator=(when_any_runner const&) = delete;
-    when_any_runner& operator=(when_any_runner&&) = delete;
+    when_any_io_runner(when_any_io_runner const&) = delete;
+    when_any_io_runner& operator=(when_any_io_runner const&) = delete;
+    when_any_io_runner& operator=(when_any_io_runner&&) = delete;
 
     auto release() noexcept
     {
@@ -394,39 +327,23 @@ struct when_any_runner
     }
 };
 
-/** Wraps a child awaitable, attempts to claim winner on completion.
-
-    Uses requires-expressions to detect state capabilities:
-    - set_winner_void(): for heterogeneous void tasks (stores monostate)
-    - set_winner_result(): for non-void tasks
-    - Neither: for homogeneous void tasks (no result storage)
-*/
-template<IoAwaitable Awaitable, typename StateType>
-when_any_runner<StateType>
-make_when_any_runner(Awaitable inner, StateType* state, std::size_t index)
+// Runner coroutine: only tries to win when the child returns !ec.
+template<std::size_t I, IoAwaitable Awaitable, typename StateType>
+when_any_io_runner<StateType>
+make_when_any_io_runner(Awaitable inner, StateType* state)
 {
-    using T = awaitable_result_t<Awaitable>;
-    if constexpr (std::is_void_v<T>)
-    {
-        co_await std::move(inner);
-        if(state->core_.try_win(index))
-        {
-            // Heterogeneous void tasks store monostate in the variant
-            if constexpr (requires { state->set_winner_void(); })
-                state->set_winner_void();
-            // Homogeneous void tasks have no result to store
-        }
-    }
-    else
+    auto result = co_await std::move(inner);
+
+    if(!result.ec)
     {
-        auto result = co_await std::move(inner);
-        if(state->core_.try_win(index))
+        // Success: try to claim winner
+        if(state->core_.try_win(I))
         {
-            // Defensive: move should not throw (already moved once), but we
-            // catch just in case since an uncaught exception would be devastating.
             try
             {
-                state->set_winner_result(std::move(result));
+                state->result_.emplace(
+                    std::in_place_index<I + 1>,
+                    detail::extract_io_payload(std::move(result)));
             }
             catch(...)
             {
@@ -434,19 +351,25 @@ make_when_any_runner(Awaitable inner, StateType* state, std::size_t index)
             }
         }
     }
+    else
+    {
+        // Error: record but don't win
+        state->record_error(result.ec);
+    }
 }
 
-/** Launches all runners concurrently; see await_suspend for lifetime concerns. */
+// Launcher for io_result-aware when_any.
 template<IoAwaitable... Awaitables>
-class when_any_launcher
+class when_any_io_launcher
 {
-    using state_type = when_any_state<awaitable_result_t<Awaitables>...>;
+    using state_type = when_any_io_state<
+        io_result_payload_t<awaitable_result_t<Awaitables>>...>;
 
     std::tuple<Awaitables...>* tasks_;
     state_type* state_;
 
 public:
-    when_any_launcher(
+    when_any_io_launcher(
         std::tuple<Awaitables...>* tasks,
         state_type* state)
         : tasks_(tasks)
@@ -459,13 +382,10 @@ class when_any_launcher
         return sizeof...(Awaitables) == 0;
     }
 
-    /** CRITICAL: If the last task finishes synchronously, parent resumes and
-        destroys this object before await_suspend returns. Must not reference
-        `this` after the final launch_one call.
-    */
-    std::coroutine_handle<> await_suspend(std::coroutine_handle<> continuation, io_env const* caller_env)
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> continuation, io_env const* caller_env)
     {
-        state_->core_.continuation_ = continuation;
+        state_->core_.continuation_.h = continuation;
         state_->core_.caller_env_ = caller_env;
 
         if(caller_env->stop_token.stop_possible())
@@ -486,237 +406,146 @@ class when_any_launcher
         return std::noop_coroutine();
     }
 
-    void await_resume() const noexcept
-    {
-    }
+    void await_resume() const noexcept {}
 
 private:
-    /** @pre Ex::dispatch() and std::coroutine_handle<>::resume() must not throw (handle may leak). */
     template<std::size_t I>
     void launch_one(executor_ref caller_ex, std::stop_token token)
     {
-        auto runner = make_when_any_runner(
-            std::move(std::get<I>(*tasks_)), state_, I);
+        auto runner = make_when_any_io_runner<I>(
+            std::move(std::get<I>(*tasks_)), state_);
 
         auto h = runner.release();
         h.promise().state_ = state_;
         h.promise().index_ = I;
-        h.promise().env_ = io_env{caller_ex, token, state_->core_.caller_env_->frame_allocator};
+        h.promise().env_ = io_env{caller_ex, token,
+            state_->core_.caller_env_->frame_allocator};
 
-        std::coroutine_handle<> ch{h};
-        state_->runner_handles_[I] = ch;
-        caller_ex.post(ch);
+        state_->runner_handles_[I].h = std::coroutine_handle<>{h};
+        caller_ex.post(state_->runner_handles_[I]);
     }
 };
 
-} // namespace detail
-
-/** Wait for the first awaitable to complete.
-
-    Races multiple heterogeneous awaitables concurrently and returns when the
-    first one completes. The result includes the winner's index and a
-    deduplicated variant containing the result value.
-
-    @par Suspends
-    The calling coroutine suspends when co_await is invoked. All awaitables
-    are launched concurrently and execute in parallel. The coroutine resumes
-    only after all awaitables have completed, even though the winner is
-    determined by the first to finish.
-
-    @par Completion Conditions
-    @li Winner is determined when the first awaitable completes (success or exception)
-    @li Only one task can claim winner status via atomic compare-exchange
-    @li Once a winner exists, stop is requested for all remaining siblings
-    @li Parent coroutine resumes only after all siblings acknowledge completion
-    @li The winner's result is returned; if the winner threw, the exception is rethrown
-
-    @par Cancellation Semantics
-    Cancellation is supported via stop_token propagated through the
-    IoAwaitable protocol:
-    @li Each child awaitable receives a stop_token derived from a shared stop_source
-    @li When the parent's stop token is activated, the stop is forwarded to all children
-    @li When a winner is determined, stop_source_.request_stop() is called immediately
-    @li Siblings must handle cancellation gracefully and complete before parent resumes
-    @li Stop requests are cooperative; tasks must check and respond to them
-
-    @par Concurrency/Overlap
-    All awaitables are launched concurrently before any can complete.
-    The launcher iterates through the arguments, starting each task on the
-    caller's executor. Tasks may execute in parallel on multi-threaded
-    executors or interleave on single-threaded executors. There is no
-    guaranteed ordering of task completion.
-
-    @par Notable Error Conditions
-    @li Winner exception: if the winning task threw, that exception is rethrown
-    @li Non-winner exceptions: silently discarded (only winner's result matters)
-    @li Cancellation: tasks may complete via cancellation without throwing
-
-    @par Example
-    @code
-    task<void> example() {
-        auto [index, result] = co_await when_any(
-            fetch_from_primary(),   // task<Response>
-            fetch_from_backup()     // task<Response>
-        );
-        // index is 0 or 1, result holds the winner's Response
-        auto response = std::get<Response>(result);
-    }
-    @endcode
-
-    @par Example with Heterogeneous Types
-    @code
-    task<void> mixed_types() {
-        auto [index, result] = co_await when_any(
-            fetch_int(),      // task<int>
-            fetch_string()    // task<std::string>
-        );
-        if (index == 0)
-            std::cout << "Got int: " << std::get<int>(result) << "\n";
-        else
-            std::cout << "Got string: " << std::get<std::string>(result) << "\n";
-    }
-    @endcode
-
-    @tparam A0 First awaitable type (must satisfy IoAwaitable).
-    @tparam As Remaining awaitable types (must satisfy IoAwaitable).
-    @param a0 The first awaitable to race.
-    @param as Additional awaitables to race concurrently.
-    @return A task yielding a pair of (winner_index, result_variant).
-
-    @throws Rethrows the winner's exception if the winning task threw an exception.
-
-    @par Remarks
-    Awaitables are moved into the coroutine frame; original objects become
-    empty after the call. When multiple awaitables share the same return type,
-    the variant is deduplicated to contain only unique types. Use the winner
-    index to determine which awaitable completed first. Void awaitables
-    contribute std::monostate to the variant.
-
-    @see when_all, IoAwaitable
-*/
-template<IoAwaitable A0, IoAwaitable... As>
-[[nodiscard]] auto when_any(A0 a0, As... as)
-    -> task<detail::when_any_result_t<
-        detail::awaitable_result_t<A0>,
-        detail::awaitable_result_t<As>...>>
-{
-    using result_type = detail::when_any_result_t<
-        detail::awaitable_result_t<A0>,
-        detail::awaitable_result_t<As>...>;
-
-    detail::when_any_state<
-        detail::awaitable_result_t<A0>,
-        detail::awaitable_result_t<As>...> state;
-    std::tuple<A0, As...> awaitable_tuple(std::move(a0), std::move(as)...);
-
-    co_await detail::when_any_launcher<A0, As...>(&awaitable_tuple, &state);
-
-    if(state.core_.winner_exception_)
-        std::rethrow_exception(state.core_.winner_exception_);
-
-    co_return result_type{state.core_.winner_index_, std::move(*state.result_)};
-}
-
-/** Concept for ranges of full I/O awaitables.
-
-    A range satisfies `IoAwaitableRange` if it is a sized input range
-    whose value type satisfies @ref IoAwaitable. This enables when_any
-    to accept any container or view of awaitables, not just std::vector.
-
-    @tparam R The range type.
-
-    @par Requirements
-    @li `R` must satisfy `std::ranges::input_range`
-    @li `R` must satisfy `std::ranges::sized_range`
-    @li `std::ranges::range_value_t<R>` must satisfy @ref IoAwaitable
-
-    @par Syntactic Requirements
-    Given `r` of type `R`:
-    @li `std::ranges::begin(r)` is valid
-    @li `std::ranges::end(r)` is valid
-    @li `std::ranges::size(r)` returns `std::ranges::range_size_t<R>`
-    @li `*std::ranges::begin(r)` satisfies @ref IoAwaitable
-
-    @par Example
-    @code
-    template<IoAwaitableRange R>
-    task<void> race_all(R&& awaitables) {
-        auto winner = co_await when_any(std::forward<R>(awaitables));
-        // Process winner...
-    }
-    @endcode
-
-    @see when_any, IoAwaitable
-*/
-template<typename R>
-concept IoAwaitableRange =
-    std::ranges::input_range<R> &&
-    std::ranges::sized_range<R> &&
-    IoAwaitable<std::ranges::range_value_t<R>>;
-
-namespace detail {
-
-/** Shared state for homogeneous when_any (range overload).
+/** Shared state for homogeneous io_result-aware when_any (range overload).
 
-    Uses composition with when_any_core for shared functionality.
-    Simpler than heterogeneous: optional<T> instead of variant, vector
-    instead of array for runner handles.
+    @tparam T The payload type extracted from io_result.
 */
 template<typename T>
-struct when_any_homogeneous_state
+struct when_any_io_homogeneous_state
 {
     when_any_core core_;
     std::optional<T> result_;
-    std::vector<std::coroutine_handle<>> runner_handles_;
+    std::unique_ptr<continuation[]> runner_handles_;
 
-    explicit when_any_homogeneous_state(std::size_t count)
+    std::mutex failure_mu_;
+    std::error_code last_error_;
+    std::exception_ptr last_exception_;
+
+    explicit when_any_io_homogeneous_state(std::size_t count)
         : core_(count)
-        , runner_handles_(count)
+        , runner_handles_(std::make_unique<continuation[]>(count))
     {
     }
 
-    // Runners self-destruct in final_suspend. No destruction needed here.
+    void record_error(std::error_code ec)
+    {
+        std::lock_guard lk(failure_mu_);
+        last_error_ = ec;
+        last_exception_ = nullptr;
+    }
 
-    /** @pre core_.try_win() returned true. */
-    void set_winner_result(T value)
-        noexcept(std::is_nothrow_move_constructible_v<T>)
+    void record_exception(std::exception_ptr ep)
     {
-        result_.emplace(std::move(value));
+        std::lock_guard lk(failure_mu_);
+        last_exception_ = ep;
+        last_error_ = {};
     }
 };
 
-/** Specialization for void tasks (no result storage needed). */
+/** Specialization for void io_result children (no payload storage). */
 template<>
-struct when_any_homogeneous_state<void>
+struct when_any_io_homogeneous_state<std::tuple<>>
 {
     when_any_core core_;
-    std::vector<std::coroutine_handle<>> runner_handles_;
+    std::unique_ptr<continuation[]> runner_handles_;
 
-    explicit when_any_homogeneous_state(std::size_t count)
+    std::mutex failure_mu_;
+    std::error_code last_error_;
+    std::exception_ptr last_exception_;
+
+    explicit when_any_io_homogeneous_state(std::size_t count)
         : core_(count)
-        , runner_handles_(count)
+        , runner_handles_(std::make_unique<continuation[]>(count))
     {
     }
 
-    // Runners self-destruct in final_suspend. No destruction needed here.
+    void record_error(std::error_code ec)
+    {
+        std::lock_guard lk(failure_mu_);
+        last_error_ = ec;
+        last_exception_ = nullptr;
+    }
 
-    // No set_winner_result - void tasks have no result to store
+    void record_exception(std::exception_ptr ep)
+    {
+        std::lock_guard lk(failure_mu_);
+        last_exception_ = ep;
+        last_error_ = {};
+    }
 };
 
-/** Launches all runners concurrently; see await_suspend for lifetime concerns. */
+/** Create an io_result-aware runner for homogeneous when_any (range path).
+
+    Only tries to win when the child returns !ec.
+*/
+template<IoAwaitable Awaitable, typename StateType>
+when_any_io_runner<StateType>
+make_when_any_io_homogeneous_runner(
+    Awaitable inner, StateType* state, std::size_t index)
+{
+    auto result = co_await std::move(inner);
+
+    if(!result.ec)
+    {
+        if(state->core_.try_win(index))
+        {
+            using PayloadT = io_result_payload_t<
+                awaitable_result_t<Awaitable>>;
+            if constexpr (!std::is_same_v<PayloadT, std::tuple<>>)
+            {
+                try
+                {
+                    state->result_.emplace(
+                        extract_io_payload(std::move(result)));
+                }
+                catch(...)
+                {
+                    state->core_.set_winner_exception(
+                        std::current_exception());
+                }
+            }
+        }
+    }
+    else
+    {
+        state->record_error(result.ec);
+    }
+}
+
+/** Launches all io_result-aware homogeneous runners concurrently. */
 template<IoAwaitableRange Range>
-class when_any_homogeneous_launcher
+class when_any_io_homogeneous_launcher
 {
     using Awaitable = std::ranges::range_value_t<Range>;
-    using T = awaitable_result_t<Awaitable>;
+    using PayloadT = io_result_payload_t<awaitable_result_t<Awaitable>>;
 
     Range* range_;
-    when_any_homogeneous_state<T>* state_;
+    when_any_io_homogeneous_state<PayloadT>* state_;
 
 public:
-    when_any_homogeneous_launcher(
+    when_any_io_homogeneous_launcher(
         Range* range,
-        when_any_homogeneous_state<T>* state)
+        when_any_io_homogeneous_state<PayloadT>* state)
         : range_(range)
         , state_(state)
     {
@@ -727,17 +556,10 @@ class when_any_homogeneous_launcher
         return std::ranges::empty(*range_);
     }
 
-    /** CRITICAL: If the last task finishes synchronously, parent resumes and
-        destroys this object before await_suspend returns. Must not reference
-        `this` after dispatching begins.
-
-        Two-phase approach:
-        1. Create all runners (safe - no dispatch yet)
-        2. Dispatch all runners (any may complete synchronously)
-    */
-    std::coroutine_handle<> await_suspend(std::coroutine_handle<> continuation, io_env const* caller_env)
+    std::coroutine_handle<> await_suspend(
+        std::coroutine_handle<> continuation, io_env const* caller_env)
     {
-        state_->core_.continuation_ = continuation;
+        state_->core_.continuation_.h = continuation;
         state_->core_.caller_env_ = caller_env;
 
         if(caller_env->stop_token.stop_possible())
@@ -753,252 +575,231 @@ class when_any_homogeneous_launcher
         auto token = state_->core_.stop_source_.get_token();
 
         // Phase 1: Create all runners without dispatching.
-        // This iterates over *range_ safely because no runners execute yet.
         std::size_t index = 0;
         for(auto&& a : *range_)
         {
-            auto runner = make_when_any_runner(
+            auto runner = make_when_any_io_homogeneous_runner(
                 std::move(a), state_, index);
 
             auto h = runner.release();
             h.promise().state_ = state_;
             h.promise().index_ = index;
-            h.promise().env_ = io_env{caller_env->executor, token, caller_env->frame_allocator};
+            h.promise().env_ = io_env{caller_env->executor, token,
+                caller_env->frame_allocator};
 
-            state_->runner_handles_[index] = std::coroutine_handle<>{h};
+            state_->runner_handles_[index].h = std::coroutine_handle<>{h};
             ++index;
         }
 
         // Phase 2: Post all runners. Any may complete synchronously.
-        // After last post, state_ and this may be destroyed.
-        // Use raw pointer/count captured before posting.
-        std::coroutine_handle<>* handles = state_->runner_handles_.data();
-        std::size_t count = state_->runner_handles_.size();
+        auto* handles = state_->runner_handles_.get();
+        std::size_t count = state_->core_.remaining_count_.load(std::memory_order_relaxed);
         for(std::size_t i = 0; i < count; ++i)
             caller_env->executor.post(handles[i]);
 
         return std::noop_coroutine();
     }
 
-    void await_resume() const noexcept
-    {
-    }
+    void await_resume() const noexcept {}
 };
 
 } // namespace detail
 
-/** Wait for the first awaitable to complete (range overload).
-
-    Races a range of awaitables with the same result type. Accepts any
-    sized input range of IoAwaitable types, enabling use with arrays,
-    spans, or custom containers.
-
-    @par Suspends
-    The calling coroutine suspends when co_await is invoked. All awaitables
-    in the range are launched concurrently and execute in parallel. The
-    coroutine resumes only after all awaitables have completed, even though
-    the winner is determined by the first to finish.
-
-    @par Completion Conditions
-    @li Winner is determined when the first awaitable completes (success or exception)
-    @li Only one task can claim winner status via atomic compare-exchange
-    @li Once a winner exists, stop is requested for all remaining siblings
-    @li Parent coroutine resumes only after all siblings acknowledge completion
-    @li The winner's index and result are returned; if the winner threw, the exception is rethrown
-
-    @par Cancellation Semantics
-    Cancellation is supported via stop_token propagated through the
-    IoAwaitable protocol:
-    @li Each child awaitable receives a stop_token derived from a shared stop_source
-    @li When the parent's stop token is activated, the stop is forwarded to all children
-    @li When a winner is determined, stop_source_.request_stop() is called immediately
-    @li Siblings must handle cancellation gracefully and complete before parent resumes
-    @li Stop requests are cooperative; tasks must check and respond to them
-
-    @par Concurrency/Overlap
-    All awaitables are launched concurrently before any can complete.
-    The launcher iterates through the range, starting each task on the
-    caller's executor. Tasks may execute in parallel on multi-threaded
-    executors or interleave on single-threaded executors. There is no
-    guaranteed ordering of task completion.
-
-    @par Notable Error Conditions
-    @li Empty range: throws std::invalid_argument immediately (not via co_return)
-    @li Winner exception: if the winning task threw, that exception is rethrown
-    @li Non-winner exceptions: silently discarded (only winner's result matters)
-    @li Cancellation: tasks may complete via cancellation without throwing
+/** Race a range of io_result-returning awaitables (non-void payloads).
 
-    @par Example
-    @code
-    task<void> example() {
-        std::array<task<Response>, 3> requests = {
-            fetch_from_server(0),
-            fetch_from_server(1),
-            fetch_from_server(2)
-        };
+    Only a child returning !ec can win. Errors and exceptions do not
+    claim winner status. If all children fail, the last failure
+    is reported — either the last error_code at variant index 0,
+    or the last exception rethrown.
 
-        auto [index, response] = co_await when_any(std::move(requests));
-    }
-    @endcode
+    @param awaitables Range of io_result-returning awaitables (must
+        not be empty).
+
+    @return A task yielding variant<error_code, pair<size_t, PayloadT>>
+        where index 0 is failure and index 1 carries the winner's
+        index and payload.
+
+    @throws std::invalid_argument if range is empty.
+    @throws Rethrows last exception when no winner and the last
+        failure was an exception.
 
-    @par Example with Vector
+    @par Example
     @code
-    task<Response> fetch_fastest(std::vector<Server> const& servers) {
-        std::vector<task<Response>> requests;
-        for (auto const& server : servers)
-            requests.push_back(fetch_from(server));
+    task<void> example()
+    {
+        std::vector<io_task<size_t>> reads;
+        for (auto& buf : buffers)
+            reads.push_back(stream.read_some(buf));
 
-        auto [index, response] = co_await when_any(std::move(requests));
-        co_return response;
+        auto result = co_await when_any(std::move(reads));
+        if (result.index() == 1)
+        {
+            auto [idx, n] = std::get<1>(result);
+        }
     }
     @endcode
 
-    @tparam R Range type satisfying IoAwaitableRange.
-    @param awaitables Range of awaitables to race concurrently (must not be empty).
-    @return A task yielding a pair of (winner_index, result).
-
-    @throws std::invalid_argument if range is empty (thrown before coroutine suspends).
-    @throws Rethrows the winner's exception if the winning task threw an exception.
-
-    @par Remarks
-    Elements are moved from the range; for lvalue ranges, the original
-    container will have moved-from elements after this call. The range
-    is moved onto the coroutine frame to ensure lifetime safety. Unlike
-    the variadic overload, no variant wrapper is needed since all tasks
-    share the same return type.
-
-    @see when_any, IoAwaitableRange
+    @see IoAwaitableRange, when_any
 */
 template<IoAwaitableRange R>
-    requires (!std::is_void_v<detail::awaitable_result_t<std::ranges::range_value_t<R>>>)
+    requires detail::is_io_result_v<
+        awaitable_result_t<std::ranges::range_value_t<R>>>
+    && (!std::is_same_v<
+            detail::io_result_payload_t<
+                awaitable_result_t<std::ranges::range_value_t<R>>>,
+            std::tuple<>>)
 [[nodiscard]] auto when_any(R&& awaitables)
-    -> task<std::pair<std::size_t, detail::awaitable_result_t<std::ranges::range_value_t<R>>>>
+    -> task<std::variant<std::error_code,
+        std::pair<std::size_t,
+            detail::io_result_payload_t<
+                awaitable_result_t<std::ranges::range_value_t<R>>>>>>
 {
     using Awaitable = std::ranges::range_value_t<R>;
-    using T = detail::awaitable_result_t<Awaitable>;
-    using result_type = std::pair<std::size_t, T>;
+    using PayloadT = detail::io_result_payload_t<
+        awaitable_result_t<Awaitable>>;
+    using result_type = std::variant<std::error_code,
+        std::pair<std::size_t, PayloadT>>;
     using OwnedRange = std::remove_cvref_t<R>;
 
     auto count = std::ranges::size(awaitables);
     if(count == 0)
         throw std::invalid_argument("when_any requires at least one awaitable");
 
-    // Move/copy range onto coroutine frame to ensure lifetime
     OwnedRange owned_awaitables = std::forward<R>(awaitables);
 
-    detail::when_any_homogeneous_state<T> state(count);
+    detail::when_any_io_homogeneous_state<PayloadT> state(count);
 
-    co_await detail::when_any_homogeneous_launcher<OwnedRange>(&owned_awaitables, &state);
+    co_await detail::when_any_io_homogeneous_launcher<OwnedRange>(
+        &owned_awaitables, &state);
 
-    if(state.core_.winner_exception_)
-        std::rethrow_exception(state.core_.winner_exception_);
+    // Winner found
+    if(state.core_.has_winner_.load(std::memory_order_acquire))
+    {
+        if(state.core_.winner_exception_)
+            std::rethrow_exception(state.core_.winner_exception_);
+        co_return result_type{std::in_place_index<1>,
+            std::pair{state.core_.winner_index_, std::move(*state.result_)}};
+    }
 
-    co_return result_type{state.core_.winner_index_, std::move(*state.result_)};
+    // No winner — report last failure
+    if(state.last_exception_)
+        std::rethrow_exception(state.last_exception_);
+    co_return result_type{std::in_place_index<0>, state.last_error_};
 }
 
-/** Wait for the first awaitable to complete (void range overload).
-
-    Races a range of void-returning awaitables. Since void awaitables have
-    no result value, only the winner's index is returned.
-
-    @par Suspends
-    The calling coroutine suspends when co_await is invoked. All awaitables
-    in the range are launched concurrently and execute in parallel. The
-    coroutine resumes only after all awaitables have completed, even though
-    the winner is determined by the first to finish.
-
-    @par Completion Conditions
-    @li Winner is determined when the first awaitable completes (success or exception)
-    @li Only one task can claim winner status via atomic compare-exchange
-    @li Once a winner exists, stop is requested for all remaining siblings
-    @li Parent coroutine resumes only after all siblings acknowledge completion
-    @li The winner's index is returned; if the winner threw, the exception is rethrown
-
-    @par Cancellation Semantics
-    Cancellation is supported via stop_token propagated through the
-    IoAwaitable protocol:
-    @li Each child awaitable receives a stop_token derived from a shared stop_source
-    @li When the parent's stop token is activated, the stop is forwarded to all children
-    @li When a winner is determined, stop_source_.request_stop() is called immediately
-    @li Siblings must handle cancellation gracefully and complete before parent resumes
-    @li Stop requests are cooperative; tasks must check and respond to them
-
-    @par Concurrency/Overlap
-    All awaitables are launched concurrently before any can complete.
-    The launcher iterates through the range, starting each task on the
-    caller's executor. Tasks may execute in parallel on multi-threaded
-    executors or interleave on single-threaded executors. There is no
-    guaranteed ordering of task completion.
-
-    @par Notable Error Conditions
-    @li Empty range: throws std::invalid_argument immediately (not via co_return)
-    @li Winner exception: if the winning task threw, that exception is rethrown
-    @li Non-winner exceptions: silently discarded (only winner's result matters)
-    @li Cancellation: tasks may complete via cancellation without throwing
+/** Race a range of void io_result-returning awaitables.
 
-    @par Example
-    @code
-    task<void> example() {
-        std::vector<task<void>> tasks;
-        for (int i = 0; i < 5; ++i)
-            tasks.push_back(background_work(i));
+    Only a child returning !ec can win. Returns the winner's index
+    at variant index 1, or error_code at index 0 on all-fail.
 
-        std::size_t winner = co_await when_any(std::move(tasks));
-        // winner is the index of the first task to complete
-    }
-    @endcode
+    @param awaitables Range of io_result<>-returning awaitables (must
+        not be empty).
+
+    @return A task yielding variant<error_code, size_t> where index 0
+        is failure and index 1 carries the winner's index.
 
-    @par Example with Timeout
+    @throws std::invalid_argument if range is empty.
+    @throws Rethrows first exception when no winner and at least one
+        child threw.
+
+    @par Example
     @code
-    task<void> with_timeout() {
-        std::vector<task<void>> tasks;
-        tasks.push_back(long_running_operation());
-        tasks.push_back(delay(std::chrono::seconds(5)));
-
-        std::size_t winner = co_await when_any(std::move(tasks));
-        if (winner == 1) {
-            // Timeout occurred
+    task<void> example()
+    {
+        std::vector<io_task<>> jobs;
+        jobs.push_back(background_work_a());
+        jobs.push_back(background_work_b());
+
+        auto result = co_await when_any(std::move(jobs));
+        if (result.index() == 1)
+        {
+            auto winner = std::get<1>(result);
         }
     }
     @endcode
 
-    @tparam R Range type satisfying IoAwaitableRange with void result.
-    @param awaitables Range of void awaitables to race concurrently (must not be empty).
-    @return A task yielding the winner's index (zero-based).
-
-    @throws std::invalid_argument if range is empty (thrown before coroutine suspends).
-    @throws Rethrows the winner's exception if the winning task threw an exception.
-
-    @par Remarks
-    Elements are moved from the range; for lvalue ranges, the original
-    container will have moved-from elements after this call. The range
-    is moved onto the coroutine frame to ensure lifetime safety. Unlike
-    the non-void overload, no result storage is needed since void tasks
-    produce no value.
-
-    @see when_any, IoAwaitableRange
+    @see IoAwaitableRange, when_any
 */
 template<IoAwaitableRange R>
-    requires std::is_void_v<detail::awaitable_result_t<std::ranges::range_value_t<R>>>
-[[nodiscard]] auto when_any(R&& awaitables) -> task<std::size_t>
+    requires detail::is_io_result_v<
+        awaitable_result_t<std::ranges::range_value_t<R>>>
+    && std::is_same_v<
+            detail::io_result_payload_t<
+                awaitable_result_t<std::ranges::range_value_t<R>>>,
+            std::tuple<>>
+[[nodiscard]] auto when_any(R&& awaitables)
+    -> task<std::variant<std::error_code, std::size_t>>
 {
     using OwnedRange = std::remove_cvref_t<R>;
+    using result_type = std::variant<std::error_code, std::size_t>;
 
     auto count = std::ranges::size(awaitables);
     if(count == 0)
         throw std::invalid_argument("when_any requires at least one awaitable");
 
-    // Move/copy range onto coroutine frame to ensure lifetime
     OwnedRange owned_awaitables = std::forward<R>(awaitables);
 
-    detail::when_any_homogeneous_state<void> state(count);
+    detail::when_any_io_homogeneous_state<std::tuple<>> state(count);
+
+    co_await detail::when_any_io_homogeneous_launcher<OwnedRange>(
+        &owned_awaitables, &state);
+
+    // Winner found
+    if(state.core_.has_winner_.load(std::memory_order_acquire))
+    {
+        if(state.core_.winner_exception_)
+            std::rethrow_exception(state.core_.winner_exception_);
+        co_return result_type{std::in_place_index<1>,
+            state.core_.winner_index_};
+    }
+
+    // No winner — report last failure
+    if(state.last_exception_)
+        std::rethrow_exception(state.last_exception_);
+    co_return result_type{std::in_place_index<0>, state.last_error_};
+}
+
+/** Race io_result-returning awaitables, selecting the first success.
+
+    Overload selected when all children return io_result<Ts...>.
+    Only a child returning !ec can win. Errors and exceptions do
+    not claim winner status.
+
+    @return A task yielding variant<error_code, R1, ..., Rn> where
+        index 0 is the failure/no-winner case and index i+1
+        identifies the winning child.
+*/
+template<IoAwaitable... As>
+    requires (sizeof...(As) > 0)
+          && detail::all_io_result_awaitables<As...>
+[[nodiscard]] auto when_any(As... as)
+    -> task<std::variant<
+        std::error_code,
+        detail::io_result_payload_t<awaitable_result_t<As>>...>>
+{
+    using result_type = std::variant<
+        std::error_code,
+        detail::io_result_payload_t<awaitable_result_t<As>>...>;
+
+    detail::when_any_io_state<
+        detail::io_result_payload_t<awaitable_result_t<As>>...> state;
+    std::tuple<As...> awaitable_tuple(std::move(as)...);
+
+    co_await detail::when_any_io_launcher<As...>(
+        &awaitable_tuple, &state);
 
-    co_await detail::when_any_homogeneous_launcher<OwnedRange>(&owned_awaitables, &state);
+    // Winner found: return their result
+    if(state.result_.has_value())
+        co_return std::move(*state.result_);
 
+    // Winner claimed but payload construction failed
     if(state.core_.winner_exception_)
         std::rethrow_exception(state.core_.winner_exception_);
 
-    co_return state.core_.winner_index_;
+    // No winner — report last failure
+    if(state.last_exception_)
+        std::rethrow_exception(state.last_exception_);
+    co_return result_type{std::in_place_index<0>, state.last_error_};
 }
 
 } // namespace capy
diff --git a/include/boost/capy/write.hpp b/include/boost/capy/write.hpp
index 347c1974f..70256af32 100644
--- a/include/boost/capy/write.hpp
+++ b/include/boost/capy/write.hpp
@@ -13,7 +13,7 @@
 #include <boost/capy/detail/config.hpp>
 #include <boost/capy/io_task.hpp>
 #include <boost/capy/buffers.hpp>
-#include <boost/capy/buffers/consuming_buffers.hpp>
+#include <boost/capy/buffers/buffer_slice.hpp>
 #include <boost/capy/concept/write_stream.hpp>
 #include <system_error>
 
@@ -22,62 +22,86 @@
 namespace boost {
 namespace capy {
 
-/** Asynchronously write the entire buffer sequence.
+/** Write an entire buffer sequence to a stream.
 
-    Writes data to the stream by calling `write_some` repeatedly
-    until the entire buffer sequence is written or an error occurs.
+    @par Await-effects
 
-    @li The operation completes when:
-    @li The entire buffer sequence has been written
-    @li An error occurs
-    @li The operation is cancelled
+    Writes the contents of `buffers` to `stream` via awaiting
+    `stream.write_some` with consecutive portions of data from `buffers`
+    until:
 
-    @par Cancellation
-    Supports cancellation via `stop_token` propagated through the
-    IoAwaitable protocol. When cancelled, returns with `cond::canceled`.
+    @li either the full content of @c buffers is processed,
+    @li or a contingency occurs.
 
-    @param stream The stream to write to. The caller retains ownership.
-    @param buffers The buffer sequence to write. The caller retains
-        ownership and must ensure validity until the operation completes.
+    If `buffer_size(buffers) == 0` then no awaiting `stream.write_some`
+    is performed. This is not a contingency.
 
-    @return An awaitable yielding `(error_code, std::size_t)`.
-        On success, `n` equals `buffer_size(buffers)`. On error,
-        `n` is the number of bytes written before the error. Compare
-        error codes to conditions:
-        @li `cond::canceled` - Operation was cancelled
-        @li `std::errc::broken_pipe` - Peer closed connection
+
+    @par Await-returns
+
+    An object of type `io_result<std::size_t>` destructuring as `[ec, n]`.
+
+    Upon a contingency, `n` represents the number of bytes written
+    so far.
+
+    Otherwise `n` represents the number of bytes written.
+
+    Contingencies:
+
+    @li The first contingency reported from
+    awaiting @c stream.write_some .
+
+    Notable conditions:
+
+    @li @c cond::canceled — Operation was cancelled,
+    @li @c std::errc::broken_pipe — Peer closed connection.
+
+
+    @par Await-postcondition
+
+    `ec || n == buffer_size(buffers)`.
+
+
+    @param stream The stream to write to. If the lifetime of `stream` ends
+    before the coroutine finishes, the behavior is undefined.
+
+    @param buffers The buffer sequence to write. If the lifetime of the buffer
+    sequence represented by `buffers` ends
+    before the coroutine finishes, the behavior is undefined.
+
+    @par Remarks
+
+    Supports _IoAwaitable cancellation_.
 
     @par Example
 
     @code
-    task<> send_response( WriteStream auto& stream, std::string_view body )
+    capy::task<> send_response(capy::WriteStream auto& stream, std::string_view body)
     {
-        auto [ec, n] = co_await write( stream, make_buffer( body ) );
-        if( ec )
-            detail::throw_system_error( ec );
+        auto [ec, n] = co_await capy::write(stream, capy::make_buffer(body));
+        if (ec)
+            throw std::system_error(ec);
+
         // All bytes written successfully
     }
     @endcode
 
-    @see write_some, WriteStream, ConstBufferSequence
+    @see WriteStream, ConstBufferSequence, IoAwaitable, io_result, cond.
 */
-auto
-write(
-    WriteStream auto& stream,
-    ConstBufferSequence auto const& buffers) ->
-        io_task<std::size_t>
+template <WriteStream S, ConstBufferSequence CB>
+auto write(S& stream, CB buffers) -> io_task<std::size_t>
 {
-    consuming_buffers consuming(buffers);
+    auto consuming = buffer_slice(buffers);
     std::size_t const total_size = buffer_size(buffers);
     std::size_t total_written = 0;
 
     while(total_written < total_size)
     {
-        auto [ec, n] = co_await stream.write_some(consuming);
+        auto [ec, n] = co_await stream.write_some(consuming.data());
+        consuming.remove_prefix(n);
+        total_written += n;
         if(ec)
             co_return {ec, total_written};
-        consuming.consume(n);
-        total_written += n;
     }
 
     co_return {{}, total_written};
diff --git a/papers/B1005.io-streamables.md b/papers/B1005.io-streamables.md
index 099ddd969..a3b40311c 100644
--- a/papers/B1005.io-streamables.md
+++ b/papers/B1005.io-streamables.md
@@ -87,13 +87,14 @@ concept ReadStream =
 
 **Semantic requirements:**
 
-If `buffer_size(buffers) > 0`, the operation reads one or more bytes:
+Attempts to read up to `buffer_size(buffers)` bytes from the stream into the buffer sequence.
 
-- On success: `!ec.failed()`, and `n >= 1`
-- On error: `ec.failed()`, and `n == 0`
-- On end-of-file: `ec == cond::eof`, and `n == 0`
+If `buffer_size(buffers) > 0`:
 
-If `buffer_empty(buffers)` is `true`, the operation completes immediately with `!ec.failed()` and `n == 0`.
+- If `!ec`, then `n >= 1 && n <= buffer_size(buffers)`. `n` bytes were read into the buffer sequence.
+- If `ec`, then `n >= 0 && n <= buffer_size(buffers)`. `n` is the number of bytes read before the I/O condition arose.
+
+If `buffer_empty(buffers)` is `true`, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream.
 
 ### 2.2 WriteStream
 
@@ -113,7 +114,14 @@ concept WriteStream =
 
 **Semantic requirements:**
 
-The operation writes one or more bytes. On success, `n` indicates bytes written (at least 1 if `buffer_size(buffers) > 0`). The caller must loop to write remaining data.
+Attempts to write up to `buffer_size(buffers)` bytes from the buffer sequence to the stream.
+
+If `buffer_size(buffers) > 0`:
+
+- If `!ec`, then `n >= 1 && n <= buffer_size(buffers)`. `n` bytes were written from the buffer sequence.
+- If `ec`, then `n >= 0 && n <= buffer_size(buffers)`. `n` is the number of bytes written before the I/O condition arose.
+
+If `buffer_empty(buffers)` is `true`, `n` is 0. The empty buffer is not itself a cause for error, but `ec` may reflect the state of the stream. The caller must loop to write remaining data.
 
 ### 2.3 ReadSource
 
@@ -186,9 +194,9 @@ concept BufferSource =
 
 The `pull` operation fills the provided buffer descriptor array with data from internal storage. On return:
 
-- **Data available**: `!ec.failed()` and `count > 0`. The array contains `count` buffer descriptors.
-- **Source exhausted**: `!ec.failed()` and `count == 0`. The transfer is complete.
-- **Error**: `ec.failed()`.
+- **Data available**: `!ec` and `count > 0`. The array contains `count` buffer descriptors.
+- **Source exhausted**: `!ec` and `count == 0`. The transfer is complete.
+- **Error**: `ec`.
 
 Calling `pull` multiple times without intervening `consume` returns the same unconsumed data. The `consume(n)` operation advances the read position by `n` bytes. The next `pull` returns data starting after the consumed bytes.
 
@@ -502,7 +510,7 @@ io_task<std::size_t> transfer(Source& source, Sink& sink)
     for(;;)
     {
         auto [ec, count] = co_await source.pull(arr, 16);
-        if(ec.failed())
+        if(ec)
             co_return {ec, total};
         if(count == 0)
         {
@@ -510,7 +518,7 @@ io_task<std::size_t> transfer(Source& source, Sink& sink)
             co_return {eof_ec, total};
         }
         auto [write_ec, n] = co_await sink.write(std::span(arr, count));
-        if(write_ec.failed())
+        if(write_ec)
             co_return {write_ec, total};
         source.consume(n);
         total += n;
@@ -533,7 +541,7 @@ task<> process_response(http_response& resp)
         auto [ec, n] = co_await body.read(mutable_buffer(buffer, sizeof(buffer)));
         if(ec == cond::eof)
             break;
-        if(ec.failed())
+        if(ec)
             throw system_error(ec);
         process_chunk(buffer, n);
     }
@@ -550,7 +558,7 @@ task<> send_compressed(any_write_sink& output, std::string_view data)
     zlib_sink compressor(output);  // Wraps output, satisfies WriteSink
     
     auto [ec, n] = co_await compressor.write(make_buffer(data), true);
-    if(ec.failed())
+    if(ec)
         throw system_error(ec);
 }
 ```
@@ -751,7 +759,7 @@ template<mutable_buffer_sequence MB>
 unspecified read_some(MB buffers);
 ```
 
-*Returns:* An awaitable yielding `(error_code, size_t)`.
+*Returns:* An awaitable that await-returns `(error_code, size_t)`.
 
 *Effects:* Equivalent to calling `read_some(buffers)` on the wrapped stream.
 
diff --git a/src/buffers/buffer_array.cpp b/src/buffers/buffer_array.cpp
deleted file mode 100644
index 03af25cae..000000000
--- a/src/buffers/buffer_array.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//
-// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-#include <boost/capy/buffers/buffer_array.hpp>
-
-namespace boost {
-namespace capy {
-namespace detail {
-
-namespace {
-
-template<class Buffer>
-void
-do_remove_prefix(
-    Buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept
-{
-    if(n >= *total_size)
-    {
-        while(*count > 0)
-            arr[--(*count)].~Buffer();
-        *total_size = 0;
-        return;
-    }
-
-    std::size_t i = 0;
-    while(i < *count && n > 0)
-    {
-        auto& b = arr[i];
-        if(n < b.size())
-        {
-            b += n;
-            *total_size -= n;
-            break;
-        }
-        n -= b.size();
-        *total_size -= b.size();
-        b.~Buffer();
-        ++i;
-    }
-
-    // Compact: move remaining buffers to front
-    if(i > 0)
-    {
-        std::size_t j = 0;
-        while(i < *count)
-        {
-            arr[j] = arr[i];
-            arr[i].~Buffer();
-            ++i;
-            ++j;
-        }
-        *count = j;
-    }
-}
-
-template<class Buffer>
-void
-do_keep_prefix(
-    Buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept
-{
-    if(n >= *total_size)
-        return;
-
-    if(n == 0)
-    {
-        while(*count > 0)
-            arr[--(*count)].~Buffer();
-        *total_size = 0;
-        return;
-    }
-
-    std::size_t remaining = n;
-    std::size_t i = 0;
-    while(i < *count && remaining > 0)
-    {
-        auto& b = arr[i];
-        if(remaining < b.size())
-        {
-            b = Buffer(b.data(), remaining);
-            ++i;
-            break;
-        }
-        remaining -= b.size();
-        ++i;
-    }
-
-    // Destruct elements beyond the new count
-    while(*count > i)
-        arr[--(*count)].~Buffer();
-
-    *total_size = n;
-}
-
-} // anonymous namespace
-
-void
-buffer_array_remove_prefix(
-    const_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept
-{
-    do_remove_prefix(arr, count, total_size, n);
-}
-
-void
-buffer_array_remove_prefix(
-    mutable_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept
-{
-    do_remove_prefix(arr, count, total_size, n);
-}
-
-void
-buffer_array_keep_prefix(
-    const_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept
-{
-    do_keep_prefix(arr, count, total_size, n);
-}
-
-void
-buffer_array_keep_prefix(
-    mutable_buffer* arr,
-    std::size_t* count,
-    std::size_t* total_size,
-    std::size_t n) noexcept
-{
-    do_keep_prefix(arr, count, total_size, n);
-}
-
-} // namespace detail
-} // namespace capy
-} // namespace boost
diff --git a/src/buffers/buffer_pair.cpp b/src/buffers/buffer_pair.cpp
deleted file mode 100644
index 13aeef5fa..000000000
--- a/src/buffers/buffer_pair.cpp
+++ /dev/null
@@ -1,99 +0,0 @@
-//
-// Copyright (c) 2023 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-#include <boost/capy/buffers/buffer_pair.hpp>
-#include <boost/capy/buffers/slice.hpp>
-
-namespace boost {
-namespace capy {
-
-void
-tag_invoke(
-    slice_tag const&,
-    const_buffer_pair& bs,
-    slice_how how,
-    std::size_t n) noexcept
-{
-    switch(how)
-    {
-    case slice_how::remove_prefix:
-    {
-        auto p = &bs[0];
-        if(n < p->size())
-        {
-            remove_prefix(*p, n);
-            return;
-        }
-        n -= p->size();
-        *p = bs[1];
-        bs[1] = {};
-        remove_prefix(*p, n);
-        return;
-    }
-
-    case slice_how::keep_prefix:
-    {
-        auto p = &bs[0];
-        if(n <= p->size())
-        {
-            keep_prefix(*p, n);
-            bs[1] = {};
-            return;
-        }
-        n -= p->size();
-        ++p;
-        keep_prefix(*p, n);
-        return;
-    }
-    }
-}
-
-void
-tag_invoke(
-    slice_tag const&,
-    mutable_buffer_pair& bs,
-    slice_how how,
-    std::size_t n) noexcept
-{
-    switch(how)
-    {
-    case slice_how::remove_prefix:
-    {
-        auto p = &bs[0];
-        if(n < p->size())
-        {
-            remove_prefix(*p, n);
-            return;
-        }
-        n -= p->size();
-        *p = bs[1];
-        bs[1] = {};
-        remove_prefix(*p, n);
-        return;
-    }
-
-    case slice_how::keep_prefix:
-    {
-        auto p = &bs[0];
-        if(n <= p->size())
-        {
-            keep_prefix(*p, n);
-            bs[1] = {};
-            return;
-        }
-        n -= p->size();
-        ++p;
-        keep_prefix(*p, n);
-        return;
-    }
-    }
-}
-
-} // capy
-} // boost
diff --git a/src/cond.cpp b/src/cond.cpp
index 903f8844e..ba54d3045 100644
--- a/src/cond.cpp
+++ b/src/cond.cpp
@@ -33,6 +33,7 @@ message(int code) const
     case cond::canceled: return "operation canceled";
     case cond::stream_truncated: return "stream truncated";
     case cond::not_found: return "not found";
+    case cond::timeout: return "operation timed out";
     default:
         return "unknown";
     }
@@ -62,6 +63,9 @@ equivalent(
     case cond::not_found:
         return ec == capy::error::not_found;
 
+    case cond::timeout:
+        return ec == capy::error::timeout;
+
     default:
         return false;
     }
@@ -72,9 +76,9 @@ equivalent(
 // msvc 14.0 has a bug that warns about inability
 // to use constexpr construction here, even though
 // there's no constexpr construction
-#if defined(_MSC_VER) && _MSC_VER <= 1900
-# pragma warning( push )
-# pragma warning( disable : 4592 )
+#if BOOST_CAPY_WORKAROUND(_MSC_VER, <= 1900)
+BOOST_CAPY_MSVC_WARNING_PUSH
+BOOST_CAPY_MSVC_WARNING_DISABLE(4592)
 #endif
 
 #if defined(__cpp_constinit) && __cpp_constinit >= 201907L
@@ -83,8 +87,8 @@ constinit cond_cat_type cond_cat;
 cond_cat_type cond_cat;
 #endif
 
-#if defined(_MSC_VER) && _MSC_VER <= 1900
-# pragma warning( pop )
+#if BOOST_CAPY_WORKAROUND(_MSC_VER, <= 1900)
+BOOST_CAPY_MSVC_WARNING_POP
 #endif
 
 } // detail
diff --git a/src/error.cpp b/src/error.cpp
index e036c9b5e..99f5c83bf 100644
--- a/src/error.cpp
+++ b/src/error.cpp
@@ -32,6 +32,7 @@ message(int code) const
     case error::test_failure: return "test failure";
     case error::stream_truncated: return "stream truncated";
     case error::not_found: return "not found";
+    case error::timeout: return "timeout";
     default:
         return "unknown";
     }
@@ -42,9 +43,9 @@ message(int code) const
 // msvc 14.0 has a bug that warns about inability
 // to use constexpr construction here, even though
 // there's no constexpr construction
-#if defined(_MSC_VER) && _MSC_VER <= 1900
-# pragma warning( push )
-# pragma warning( disable : 4592 )
+#if BOOST_CAPY_WORKAROUND(_MSC_VER, <= 1900)
+BOOST_CAPY_MSVC_WARNING_PUSH
+BOOST_CAPY_MSVC_WARNING_DISABLE(4592)
 #endif
 
 #if defined(__cpp_constinit) && __cpp_constinit >= 201907L
@@ -53,8 +54,8 @@ constinit error_cat_type error_cat;
 error_cat_type error_cat;
 #endif
 
-#if defined(_MSC_VER) && _MSC_VER <= 1900
-# pragma warning( pop )
+#if BOOST_CAPY_WORKAROUND(_MSC_VER, <= 1900)
+BOOST_CAPY_MSVC_WARNING_POP
 #endif
 
 } // detail
diff --git a/src/ex/detail/strand_impl.hpp b/src/ex/detail/strand_impl.hpp
new file mode 100644
index 000000000..fa976b763
--- /dev/null
+++ b/src/ex/detail/strand_impl.hpp
@@ -0,0 +1,48 @@
+//
+// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_SRC_EX_DETAIL_STRAND_IMPL_HPP
+#define BOOST_CAPY_SRC_EX_DETAIL_STRAND_IMPL_HPP
+
+#include "src/ex/detail/strand_queue.hpp"
+#include <boost/capy/detail/intrusive.hpp>
+#include <atomic>
+#include <mutex>
+#include <thread>
+
+namespace boost {
+namespace capy {
+namespace detail {
+
+class strand_service_impl;
+
+/** Implementation state for a single strand.
+
+    Each strand owns one of these via shared_ptr. The mutex is borrowed
+    from the service's shared pool. The intrusive_list base links this
+    impl into the service's list of live impls for shutdown traversal.
+*/
+struct strand_impl
+    : intrusive_list<strand_impl>::node
+{
+    std::mutex* mutex_ = nullptr;
+    strand_queue pending_;
+    bool locked_ = false;
+    std::atomic<std::thread::id> dispatch_thread_{};
+
+    std::atomic<strand_service_impl*> service_{nullptr};
+
+    ~strand_impl();
+};
+
+} // namespace detail
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/src/ex/detail/strand_queue.hpp b/src/ex/detail/strand_queue.hpp
index a0461dfda..0c4a2c9c1 100644
--- a/src/ex/detail/strand_queue.hpp
+++ b/src/ex/detail/strand_queue.hpp
@@ -1,5 +1,6 @@
 //
 // Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
+// Copyright (c) 2026 Michael Vandeberg
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,232 +11,68 @@
 #ifndef BOOST_CAPY_SRC_EX_DETAIL_STRAND_QUEUE_HPP
 #define BOOST_CAPY_SRC_EX_DETAIL_STRAND_QUEUE_HPP
 
+#include <boost/capy/continuation.hpp>
 #include <boost/capy/detail/config.hpp>
-
-#include <coroutine>
-#include <cstddef>
-#include <exception>
+#include <boost/capy/ex/frame_allocator.hpp>
 
 namespace boost {
 namespace capy {
 namespace detail {
 
-class strand_queue;
-
-//----------------------------------------------------------
-
-// Metadata stored before the coroutine frame
-struct frame_prefix
-{
-    frame_prefix* next;
-    strand_queue* queue;
-    std::size_t alloc_size;
-};
-
-//----------------------------------------------------------
+/** Single-threaded intrusive FIFO of pending continuations.
 
-/** Wrapper coroutine for strand queue dispatch operations.
-
-    This coroutine wraps a target coroutine handle and resumes
-    it when dispatched. The wrapper ensures control returns to
-    the dispatch loop after the target suspends or completes.
-
-    The promise contains an intrusive list node for queue
-    storage and supports a custom allocator that recycles
-    coroutine frames via a free list.
-*/
-struct strand_op
-{
-    struct promise_type
-    {
-        promise_type* next = nullptr;
-
-        void*
-        operator new(
-            std::size_t size,
-            strand_queue& q,
-            std::coroutine_handle<void>);
-
-        void
-        operator delete(void* p, std::size_t);
-
-        strand_op
-        get_return_object() noexcept
-        {
-            return {std::coroutine_handle<promise_type>::from_promise(*this)};
-        }
-
-        std::suspend_always
-        initial_suspend() noexcept
-        {
-            return {};
-        }
-
-        std::suspend_always
-        final_suspend() noexcept
-        {
-            return {};
-        }
-
-        void
-        return_void() noexcept
-        {
-        }
-
-        void
-        unhandled_exception()
-        {
-            std::terminate();
-        }
-    };
-
-    std::coroutine_handle<promise_type> h_;
-};
-
-//----------------------------------------------------------
-
-/** Single-threaded dispatch queue for coroutine handles.
-
-    This queue stores coroutine handles and resumes them
-    sequentially when dispatch() is called. Each pushed
-    handle is wrapped in a strand_op coroutine that ensures
-    control returns to the dispatch loop after the target
-    suspends or completes.
-
-    The queue uses an intrusive singly-linked list through
-    the promise type to avoid separate node allocations.
-    A free list recycles wrapper coroutine frames to reduce
-    allocation overhead during repeated push/dispatch cycles.
+    Links continuations directly through `continuation::next`, so
+    push() carries no per-item allocation.
 
     @par Thread Safety
-    This class is not thread-safe. All operations must be
-    called from a single thread.
+    Not thread-safe. Caller must externally synchronize push() and
+    take_all(). dispatch_batch() does not touch queue state and may
+    run unlocked once the batch has been taken.
 */
 class strand_queue
 {
-    using promise_type = strand_op::promise_type;
-
-    promise_type* head_ = nullptr;
-    promise_type* tail_ = nullptr;
-    frame_prefix* free_list_ = nullptr;
-
-    friend struct strand_op::promise_type;
-
-    static
-    strand_op
-    make_strand_op(
-        strand_queue& q,
-        std::coroutine_handle<void> target)
-    {
-        (void)q;
-        target.resume();
-        co_return;
-    }
+    continuation* head_ = nullptr;
+    continuation* tail_ = nullptr;
 
 public:
     strand_queue() = default;
-
     strand_queue(strand_queue const&) = delete;
     strand_queue& operator=(strand_queue const&) = delete;
 
-    /** Destructor.
-
-        Destroys any pending wrappers without resuming them,
-        then frees all memory in the free list.
-    */
-    ~strand_queue()
-    {
-        // Destroy pending wrappers
-        while(head_)
-        {
-            promise_type* p = head_;
-            head_ = p->next;
-
-            auto h = std::coroutine_handle<promise_type>::from_promise(*p);
-            h.destroy();
-        }
-
-        // Free the free list memory
-        while(free_list_)
-        {
-            frame_prefix* prefix = free_list_;
-            free_list_ = prefix->next;
-            ::operator delete(prefix);
-        }
-    }
-
-    /** Returns true if there are no pending operations.
-    */
+    /** Returns true if there are no pending continuations. */
     bool
     empty() const noexcept
     {
         return head_ == nullptr;
     }
 
-    /** Push a coroutine handle to the queue.
-
-        Creates a wrapper coroutine and appends it to the
-        queue. The wrapper will resume the target handle
-        when dispatch() processes it.
+    /** Push a continuation to the queue.
 
-        @param h The coroutine handle to dispatch.
+        @param c The continuation to enqueue; see `continuation`
+            for lifetime and aliasing requirements.
     */
     void
-    push(std::coroutine_handle<void> h)
+    push(continuation& c) noexcept
     {
-        strand_op op = make_strand_op(*this, h);
-
-        promise_type* p = &op.h_.promise();
-        p->next = nullptr;
-
+        c.next = nullptr;
         if(tail_)
-            tail_->next = p;
+            tail_->next = &c;
         else
-            head_ = p;
-        tail_ = p;
-    }
-
-    /** Resume all queued coroutines in sequence.
-
-        Processes each wrapper in FIFO order, resuming its
-        target coroutine. After each target suspends or
-        completes, the wrapper is destroyed and its frame
-        is added to the free list for reuse.
-
-        Coroutines resumed during dispatch may push new
-        handles, which will also be processed in the same
-        dispatch call.
-
-        @warning Not thread-safe. Do not call while another
-            thread may be calling push().
-    */
-    void
-    dispatch()
-    {
-        while(head_)
-        {
-            promise_type* p = head_;
-            head_ = p->next;
-            if(!head_)
-                tail_ = nullptr;
-
-            auto h = std::coroutine_handle<promise_type>::from_promise(*p);
-            h.resume();
-            h.destroy();
-        }
+            head_ = &c;
+        tail_ = &c;
     }
 
     /** Batch of taken items for thread-safe dispatch. */
     struct taken_batch
     {
-        promise_type* head = nullptr;
-        promise_type* tail = nullptr;
+        continuation* head = nullptr;
+        continuation* tail = nullptr;
     };
 
     /** Take all pending items atomically.
 
-        Removes all items from the queue and returns them
-        as a batch. The queue is left empty.
+        Removes all items from the queue and returns them as a
+        batch. The queue is left empty.
 
         @return The batch of taken items.
     */
@@ -247,13 +84,16 @@ class strand_queue
         return batch;
     }
 
-    /** Dispatch a batch of taken items.
+    /** Resume each continuation in a taken batch.
+
+        Advances past each node before resuming, since the
+        resumed coroutine may destroy the awaitable (and thus
+        the continuation) before control returns here.
 
         @param batch The batch to dispatch.
 
-        @note This is thread-safe w.r.t. push() because it doesn't
-            access the queue's free_list_. Frames are deleted directly
-            rather than recycled.
+        @note Thread-safe with respect to push() because the queue
+            itself is not touched.
     */
     static
     void
@@ -261,69 +101,14 @@ class strand_queue
     {
         while(batch.head)
         {
-            promise_type* p = batch.head;
-            batch.head = p->next;
-
-            auto h = std::coroutine_handle<promise_type>::from_promise(*p);
-            h.resume();
-            // Don't use h.destroy() - it would call operator delete which
-            // accesses the queue's free_list_ (race with push).
-            // Instead, manually free the frame without recycling.
-            // h.address() returns the frame base (what operator new returned).
-            frame_prefix* prefix = static_cast<frame_prefix*>(h.address()) - 1;
-            ::operator delete(prefix);
+            continuation* c = batch.head;
+            batch.head = c->next;
+            safe_resume(c->h);
         }
         batch.tail = nullptr;
     }
 };
 
-//----------------------------------------------------------
-
-inline
-void*
-strand_op::promise_type::operator new(
-    std::size_t size,
-    strand_queue& q,
-    std::coroutine_handle<void>)
-{
-    // Total size includes prefix
-    std::size_t alloc_size = size + sizeof(frame_prefix);
-    void* raw;
-    
-    // Try to reuse from free list
-    if(q.free_list_)
-    {
-        frame_prefix* prefix = q.free_list_;
-        q.free_list_ = prefix->next;
-        raw = prefix;
-    }
-    else
-    {
-        raw = ::operator new(alloc_size);
-    }
-
-    // Initialize prefix
-    frame_prefix* prefix = static_cast<frame_prefix*>(raw);
-    prefix->next = nullptr;
-    prefix->queue = &q;
-    prefix->alloc_size = alloc_size;
-
-    // Return pointer AFTER the prefix (this is where coroutine frame goes)
-    return prefix + 1;
-}
-
-inline
-void
-strand_op::promise_type::operator delete(void* p, std::size_t)
-{
-    // Calculate back to get the prefix
-    frame_prefix* prefix = static_cast<frame_prefix*>(p) - 1;
-
-    // Add to free list
-    prefix->next = prefix->queue->free_list_;
-    prefix->queue->free_list_ = prefix;
-}
-
 } // namespace detail
 } // namespace capy
 } // namespace boost
diff --git a/src/ex/detail/strand_service.cpp b/src/ex/detail/strand_service.cpp
index 3a4b89403..a7005f4c2 100644
--- a/src/ex/detail/strand_service.cpp
+++ b/src/ex/detail/strand_service.cpp
@@ -7,139 +7,74 @@
 // Official repository: https://github.com/cppalliance/capy
 //
 
-#include "src/ex/detail/strand_queue.hpp"
+#include "src/ex/detail/strand_impl.hpp"
 #include <boost/capy/ex/detail/strand_service.hpp>
-#include <atomic>
+#include <boost/capy/continuation.hpp>
 #include <coroutine>
-#include <mutex>
-#include <thread>
+#include <memory>
 #include <utility>
 
 namespace boost {
 namespace capy {
 namespace detail {
 
-//----------------------------------------------------------
+// Sentinel stored in invoker_frame_cache_ after shutdown to prevent
+// in-flight invokers from repopulating a freed cache slot.
+inline void* const kCacheClosed = reinterpret_cast<void*>(1);
 
-/** Implementation state for a strand.
+/** Concrete strand_service.
 
-    Each strand_impl provides serialization for coroutines
-    dispatched through strands that share it.
-*/
-struct strand_impl
-{
-    std::mutex mutex_;
-    strand_queue pending_;
-    bool locked_ = false;
-    std::atomic<std::thread::id> dispatch_thread_{};
-    void* cached_frame_ = nullptr;
-};
+    Holds a shared mutex pool (193 entries), a linked list of live
+    impls (for shutdown traversal), and a single-slot invoker
+    coroutine frame cache shared across all strands of this service.
 
-//----------------------------------------------------------
-
-/** Invoker coroutine for strand dispatch.
-
-    Uses custom allocator to recycle frame - one allocation
-    per strand_impl lifetime, stored in trailer for recovery.
-*/
-struct strand_invoker
-{
-    struct promise_type
-    {
-        void* operator new(std::size_t n, strand_impl& impl)
-        {
-            constexpr auto A = alignof(strand_impl*);
-            std::size_t padded = (n + A - 1) & ~(A - 1);
-            std::size_t total = padded + sizeof(strand_impl*);
-
-            void* p = impl.cached_frame_
-                ? std::exchange(impl.cached_frame_, nullptr)
-                : ::operator new(total);
-
-            // Trailer lets delete recover impl
-            *reinterpret_cast<strand_impl**>(
-                static_cast<char*>(p) + padded) = &impl;
-            return p;
-        }
-
-        void operator delete(void* p, std::size_t n) noexcept
-        {
-            constexpr auto A = alignof(strand_impl*);
-            std::size_t padded = (n + A - 1) & ~(A - 1);
-
-            auto* impl = *reinterpret_cast<strand_impl**>(
-                static_cast<char*>(p) + padded);
-
-            if (!impl->cached_frame_)
-                impl->cached_frame_ = p;
-            else
-                ::operator delete(p);
-        }
-
-        strand_invoker get_return_object() noexcept
-        { return {std::coroutine_handle<promise_type>::from_promise(*this)}; }
-
-        std::suspend_always initial_suspend() noexcept { return {}; }
-        std::suspend_never final_suspend() noexcept { return {}; }
-        void return_void() noexcept {}
-        void unhandled_exception() { std::terminate(); }
-    };
-
-    std::coroutine_handle<promise_type> h_;
-};
-
-//----------------------------------------------------------
-
-/** Concrete implementation of strand_service.
-
-    Holds the fixed pool of strand_impl objects.
+    The dispatch helpers (`enqueue`, `dispatch_pending`, etc.) are
+    public so the namespace-scope `make_invoker` coroutine and the
+    `strand_service` static methods can call them without friendship.
 */
 class strand_service_impl : public strand_service
 {
-    static constexpr std::size_t num_impls = 211;
+public:
+    static constexpr std::size_t num_mutexes = 193;
 
-    strand_impl impls_[num_impls];
-    std::size_t salt_ = 0;
     std::mutex mutex_;
+    std::size_t salt_ = 0;
+    std::shared_ptr<std::mutex> mutexes_[num_mutexes];
+    intrusive_list<strand_impl> impl_list_;
+    std::atomic<void*> invoker_frame_cache_{nullptr};
 
-public:
     explicit
     strand_service_impl(execution_context&)
     {
     }
 
-    strand_impl*
-    get_implementation() override
+    std::shared_ptr<strand_impl>
+    create_implementation() override
     {
+        auto new_impl = std::make_shared<strand_impl>();
+
         std::lock_guard<std::mutex> lock(mutex_);
-        std::size_t index = salt_++;
-        index = index % num_impls;
-        return &impls_[index];
-    }
 
-protected:
-    void
-    shutdown() override
-    {
-        for(std::size_t i = 0; i < num_impls; ++i)
-        {
-            std::lock_guard<std::mutex> lock(impls_[i].mutex_);
-            impls_[i].locked_ = true;
-
-            if(impls_[i].cached_frame_)
-            {
-                ::operator delete(impls_[i].cached_frame_);
-                impls_[i].cached_frame_ = nullptr;
-            }
-        }
+        std::size_t s = salt_++;
+        std::size_t idx = reinterpret_cast<std::size_t>(new_impl.get());
+        idx += idx >> 3;
+        idx ^= s + 0x9e3779b9 + (idx << 6) + (idx >> 2);
+        idx %= num_mutexes;
+        if(!mutexes_[idx])
+            mutexes_[idx] = std::make_shared<std::mutex>();
+        new_impl->mutex_ = mutexes_[idx].get();
+
+        impl_list_.push_back(new_impl.get());
+        new_impl->service_.store(this, std::memory_order_release);
+
+        return new_impl;
     }
 
-private:
     static bool
-    enqueue(strand_impl& impl, std::coroutine_handle<> h)
+    enqueue(strand_impl& impl, continuation& c)
     {
-        std::lock_guard<std::mutex> lock(impl.mutex_);
-        impl.pending_.push(h);
+        std::lock_guard<std::mutex> lock(*impl.mutex_);
+        impl.pending_.push(c);
         if(!impl.locked_)
         {
             impl.locked_ = true;
@@ -153,7 +88,7 @@ class strand_service_impl : public strand_service
     {
         strand_queue::taken_batch batch;
         {
-            std::lock_guard<std::mutex> lock(impl.mutex_);
+            std::lock_guard<std::mutex> lock(*impl.mutex_);
             batch = impl.pending_.take_all();
         }
         impl.pending_.dispatch_batch(batch);
@@ -162,7 +97,7 @@ class strand_service_impl : public strand_service
     static bool
     try_unlock(strand_impl& impl)
     {
-        std::lock_guard<std::mutex> lock(impl.mutex_);
+        std::lock_guard<std::mutex> lock(*impl.mutex_);
         if(impl.pending_.empty())
         {
             impl.locked_ = false;
@@ -183,28 +118,135 @@ class strand_service_impl : public strand_service
         impl.dispatch_thread_.store(std::thread::id{});
     }
 
-    // Loops until queue empty (aggressive). Alternative: per-batch fairness
-    // (repost after each batch to let other work run) - explore if starvation observed.
-    static strand_invoker
-    make_invoker(strand_impl& impl)
+    // Defined below; needs strand_invoker complete.
+    static void
+    post_invoker(std::shared_ptr<strand_impl> impl, executor_ref ex);
+
+protected:
+    void
+    shutdown() override
     {
-        strand_impl* p = &impl;
-        for(;;)
+        std::lock_guard<std::mutex> lock(mutex_);
+        while(auto* p = impl_list_.pop_front())
         {
-            set_dispatch_thread(*p);
-            dispatch_pending(*p);
-            if(try_unlock(*p))
-            {
-                clear_dispatch_thread(*p);
-                co_return;
-            }
+            std::lock_guard<std::mutex> impl_lock(*p->mutex_);
+            p->locked_ = true;
+            p->service_.store(nullptr, std::memory_order_release);
         }
+
+        void* fp = invoker_frame_cache_.exchange(
+            kCacheClosed, std::memory_order_acq_rel);
+        if(fp) ::operator delete(fp);
     }
+};
+
+/** Invoker coroutine that drains a strand's pending queue.
+
+    Runs once the strand transitions from unlocked to locked. Holds
+    the impl alive via the coroutine parameter (a shared_ptr in the
+    coroutine frame), so user code may drop its strand handle while
+    the invoker is mid-flight.
+
+    The frame's allocator recycles a single per-service slot. The
+    trailer points at the service (lifetime: execution_context),
+    NOT the impl (lifetime: per-strand), so operator delete is
+    safe even after the impl has been destroyed.
+*/
+struct strand_invoker
+{
+    struct promise_type
+    {
+        // Stored in the coroutine frame so its address is stable for
+        // posting to the inner executor.
+        continuation self_;
+
+        void*
+        operator new(
+            std::size_t n,
+            std::shared_ptr<strand_impl> const& impl)
+        {
+            auto* svc = impl->service_.load(std::memory_order_acquire);
+            constexpr auto A = alignof(strand_service_impl*);
+            std::size_t padded = (n + A - 1) & ~(A - 1);
+            std::size_t total = padded + sizeof(strand_service_impl*);
+
+            void* p = svc->invoker_frame_cache_.exchange(
+                nullptr, std::memory_order_acquire);
+            if(!p || p == kCacheClosed)
+                p = ::operator new(total);
+
+            *reinterpret_cast<strand_service_impl**>(
+                static_cast<char*>(p) + padded) = svc;
+            return p;
+        }
+
+        void
+        operator delete(void* p, std::size_t n) noexcept
+        {
+            constexpr auto A = alignof(strand_service_impl*);
+            std::size_t padded = (n + A - 1) & ~(A - 1);
+            auto* svc = *reinterpret_cast<strand_service_impl**>(
+                static_cast<char*>(p) + padded);
 
-    friend class strand_service;
+            void* expected = nullptr;
+            if(!svc->invoker_frame_cache_.compare_exchange_strong(
+                    expected, p, std::memory_order_release))
+                ::operator delete(p);
+        }
+
+        strand_invoker
+        get_return_object() noexcept
+        {
+            return {std::coroutine_handle<promise_type>::from_promise(*this)};
+        }
+
+        std::suspend_always initial_suspend() noexcept { return {}; }
+        std::suspend_never  final_suspend()   noexcept { return {}; }
+        void return_void() noexcept {}
+        void unhandled_exception() { std::terminate(); }
+    };
+
+    std::coroutine_handle<promise_type> h_;
 };
 
-//----------------------------------------------------------
+// The by-value parameter lives in the coroutine frame for the
+// invoker's lifetime, keeping the impl alive past any user-side
+// strand drop.
+static
+strand_invoker
+make_invoker(std::shared_ptr<strand_impl> impl)
+{
+    auto* p = impl.get();
+    for(;;)
+    {
+        strand_service_impl::set_dispatch_thread(*p);
+        strand_service_impl::dispatch_pending(*p);
+        if(strand_service_impl::try_unlock(*p))
+        {
+            strand_service_impl::clear_dispatch_thread(*p);
+            co_return;
+        }
+    }
+}
+
+void
+strand_service_impl::post_invoker(
+    std::shared_ptr<strand_impl> impl,
+    executor_ref ex)
+{
+    auto invoker = make_invoker(std::move(impl));
+    auto& self = invoker.h_.promise().self_;
+    self.h = invoker.h_;
+    ex.post(self);
+}
+
+strand_impl::~strand_impl()
+{
+    auto* svc = service_.load(std::memory_order_acquire);
+    if(!svc) return;
+    std::lock_guard<std::mutex> lock(svc->mutex_);
+    svc->impl_list_.remove(this);
+}
 
 strand_service::
 strand_service()
@@ -224,22 +266,28 @@ running_in_this_thread(strand_impl& impl) noexcept
 
 std::coroutine_handle<>
 strand_service::
-dispatch(strand_impl& impl, executor_ref ex, std::coroutine_handle<> h)
+dispatch(
+    std::shared_ptr<strand_impl> const& impl,
+    executor_ref ex,
+    continuation& c)
 {
-    if(running_in_this_thread(impl))
-        return h;
+    if(running_in_this_thread(*impl))
+        return c.h;
 
-    if(strand_service_impl::enqueue(impl, h))
-        ex.post(strand_service_impl::make_invoker(impl).h_);
+    if(strand_service_impl::enqueue(*impl, c))
+        strand_service_impl::post_invoker(impl, ex);
     return std::noop_coroutine();
 }
 
 void
 strand_service::
-post(strand_impl& impl, executor_ref ex, std::coroutine_handle<> h)
+post(
+    std::shared_ptr<strand_impl> const& impl,
+    executor_ref ex,
+    continuation& c)
 {
-    if(strand_service_impl::enqueue(impl, h))
-        ex.post(strand_service_impl::make_invoker(impl).h_);
+    if(strand_service_impl::enqueue(*impl, c))
+        strand_service_impl::post_invoker(impl, ex);
 }
 
 strand_service&
diff --git a/src/ex/detail/timer_service.cpp b/src/ex/detail/timer_service.cpp
new file mode 100644
index 000000000..995a6d3e0
--- /dev/null
+++ b/src/ex/detail/timer_service.cpp
@@ -0,0 +1,125 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include <boost/capy/ex/detail/timer_service.hpp>
+
+namespace boost {
+namespace capy {
+namespace detail {
+
+timer_service::
+timer_service(execution_context& ctx)
+    : thread_([this] { run(); })
+{
+    (void)ctx;
+}
+
+timer_service::
+~timer_service()
+{
+    stop_and_join();
+}
+
+timer_service::timer_id
+timer_service::
+schedule_at(
+    std::chrono::steady_clock::time_point deadline,
+    std::function<void()> cb)
+{
+    std::lock_guard lock(mutex_);
+    auto id = ++next_id_;
+    active_ids_.insert(id);
+    queue_.push(entry{deadline, id, std::move(cb)});
+    cv_.notify_one();
+    return id;
+}
+
+void
+timer_service::
+cancel(timer_id id)
+{
+    std::unique_lock lock(mutex_);
+    if(!active_ids_.contains(id))
+        return;
+    if(executing_id_ == id)
+    {
+        // Callback is running — wait for it to finish.
+        // run() erases from active_ids_ after execution.
+        while(executing_id_ == id)
+            cancel_cv_.wait(lock);
+        return;
+    }
+    active_ids_.erase(id);
+}
+
+void
+timer_service::
+stop_and_join()
+{
+    {
+        std::lock_guard lock(mutex_);
+        stopped_ = true;
+    }
+    cv_.notify_one();
+    if(thread_.joinable())
+        thread_.join();
+}
+
+void
+timer_service::
+shutdown()
+{
+    stop_and_join();
+}
+
+void
+timer_service::
+run()
+{
+    std::unique_lock lock(mutex_);
+    for(;;)
+    {
+        if(stopped_)
+            return;
+
+        if(queue_.empty())
+        {
+            cv_.wait(lock);
+            continue;
+        }
+
+        auto deadline = queue_.top().deadline;
+        auto now = std::chrono::steady_clock::now();
+        if(deadline > now)
+        {
+            cv_.wait_until(lock, deadline);
+            continue;
+        }
+
+        // Pop the entry (const_cast needed because priority_queue::top is const)
+        auto e = std::move(const_cast<entry&>(queue_.top()));
+        queue_.pop();
+
+        // Skip if cancelled (no longer in active set)
+        if(!active_ids_.contains(e.id))
+            continue;
+
+        executing_id_ = e.id;
+        lock.unlock();
+        e.callback();
+        lock.lock();
+        active_ids_.erase(e.id);
+        executing_id_ = 0;
+        cancel_cv_.notify_all();
+    }
+}
+
+} // detail
+} // capy
+} // boost
diff --git a/src/ex/thread_pool.cpp b/src/ex/thread_pool.cpp
index a5a0c1e76..393a397c9 100644
--- a/src/ex/thread_pool.cpp
+++ b/src/ex/thread_pool.cpp
@@ -9,8 +9,11 @@
 //
 
 #include <boost/capy/ex/thread_pool.hpp>
-#include <boost/capy/detail/intrusive.hpp>
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/detail/thread_local_ptr.hpp>
+#include <boost/capy/ex/frame_allocator.hpp>
 #include <boost/capy/test/thread_name.hpp>
+#include <algorithm>
 #include <atomic>
 #include <condition_variable>
 #include <cstdio>
@@ -21,18 +24,25 @@
 /*
     Thread pool implementation using a shared work queue.
 
-    Work items are coroutine handles wrapped in intrusive list nodes, stored
-    in a single queue protected by a mutex. Worker threads wait on a
-    condition_variable until work is available or stop is requested.
+    Work items are continuations linked via their intrusive next pointer,
+    stored in a single queue protected by a mutex. No per-post heap
+    allocation: the continuation is owned by the caller and linked
+    directly. Worker threads wait on a condition_variable until work
+    is available or stop is requested.
 
     Threads are started lazily on first post() via std::call_once to avoid
     spawning threads for pools that are constructed but never used. Each
     thread is named with a configurable prefix plus index for debugger
     visibility.
 
-    Shutdown sequence: stop() sets the stop flag and notifies all threads,
-    then the destructor joins threads and destroys any remaining queued
-    work without executing it.
+    Work tracking: on_work_started/on_work_finished maintain an atomic
+    outstanding_work_ counter. join() blocks until this counter reaches
+    zero, then signals workers to stop and joins threads.
+
+    Two shutdown paths:
+    - join(): waits for outstanding work to drain, then stops workers.
+    - stop(): immediately signals workers to exit; queued work is abandoned.
+    - Destructor: stop() then join() (abandon + wait for threads).
 */
 
 namespace boost {
@@ -42,56 +52,85 @@ namespace capy {
 
 class thread_pool::impl
 {
-    struct work : detail::intrusive_queue<work>::node
-    {
-        std::coroutine_handle<> h_;
+    // Identifies the pool owning the current worker thread, or
+    // nullptr if the calling thread is not a pool worker. Checked
+    // by dispatch() to decide between symmetric transfer (inline
+    // resume) and post.
+    static inline detail::thread_local_ptr<impl const> current_;
 
-        explicit work(std::coroutine_handle<> h) noexcept
-            : h_(h)
-        {
-        }
+    // Intrusive queue of continuations via continuation::next.
+    // No per-post allocation: the continuation is owned by the caller.
+    continuation* head_ = nullptr;
+    continuation* tail_ = nullptr;
 
-        void run()
-        {
-            auto h = h_;
-            delete this;
-            h.resume();
-        }
+    void push(continuation* c) noexcept
+    {
+        c->next = nullptr;
+        if(tail_)
+            tail_->next = c;
+        else
+            head_ = c;
+        tail_ = c;
+    }
 
-        void destroy()
-        {
-            delete this;
-        }
-    };
+    continuation* pop() noexcept
+    {
+        if(!head_)
+            return nullptr;
+        continuation* c = head_;
+        head_ = head_->next;
+        if(!head_)
+            tail_ = nullptr;
+        return c;
+    }
+
+    bool empty() const noexcept
+    {
+        return head_ == nullptr;
+    }
 
     std::mutex mutex_;
-    std::condition_variable cv_;
-    detail::intrusive_queue<work> q_;
+    std::condition_variable work_cv_;
+    std::condition_variable done_cv_;
     std::vector<std::thread> threads_;
-    std::atomic<bool> stop_{false};
+    std::atomic<std::size_t> outstanding_work_{0};
+    bool stop_{false};
+    bool joined_{false};
     std::size_t num_threads_;
     char thread_name_prefix_[13]{};  // 12 chars max + null terminator
     std::once_flag start_flag_;
 
 public:
-    ~impl()
+    ~impl() = default;
+
+    bool
+    running_in_this_thread() const noexcept
     {
-        stop();
-        for(auto& t : threads_)
-            if(t.joinable())
-                t.join();
+        return current_.get() == this;
+    }
 
-        while(auto* w = q_.pop())
-            w->destroy();
+    // Destroy abandoned coroutine frames. Must be called
+    // before execution_context::shutdown()/destroy() so
+    // that suspended-frame destructors (e.g. delay_awaitable
+    // calling timer_service::cancel()) run while services
+    // are still valid.
+    void
+    drain_abandoned() noexcept
+    {
+        while(auto* c = pop())
+        {
+            auto h = c->h;
+            if(h && h != std::noop_coroutine())
+                h.destroy();
+        }
     }
 
     impl(std::size_t num_threads, std::string_view thread_name_prefix)
         : num_threads_(num_threads)
     {
         if(num_threads_ == 0)
-            num_threads_ = std::thread::hardware_concurrency();
-        if(num_threads_ == 0)
-            num_threads_ = 1;
+            num_threads_ = std::max(
+                std::thread::hardware_concurrency(), 1u);
 
         // Truncate prefix to 12 chars, leaving room for up to 3-digit index.
         auto n = thread_name_prefix.copy(thread_name_prefix_, 12);
@@ -99,22 +138,73 @@ class thread_pool::impl
     }
 
     void
-    post(std::coroutine_handle<> h)
+    post(continuation& c)
     {
         ensure_started();
-        auto* w = new work(h);
         {
             std::lock_guard<std::mutex> lock(mutex_);
-            q_.push(w);
+            push(&c);
+        }
+        work_cv_.notify_one();
+    }
+
+    void
+    on_work_started() noexcept
+    {
+        outstanding_work_.fetch_add(1, std::memory_order_acq_rel);
+    }
+
+    void
+    on_work_finished() noexcept
+    {
+        if(outstanding_work_.fetch_sub(
+            1, std::memory_order_acq_rel) == 1)
+        {
+            std::lock_guard<std::mutex> lock(mutex_);
+            if(joined_ && !stop_)
+                stop_ = true;
+            done_cv_.notify_all();
+            work_cv_.notify_all();
         }
-        cv_.notify_one();
+    }
+
+    void
+    join() noexcept
+    {
+        {
+            std::unique_lock<std::mutex> lock(mutex_);
+            if(joined_)
+                return;
+            joined_ = true;
+
+            if(outstanding_work_.load(
+                std::memory_order_acquire) == 0)
+            {
+                stop_ = true;
+                work_cv_.notify_all();
+            }
+            else
+            {
+                done_cv_.wait(lock, [this]{
+                    return stop_;
+                });
+            }
+        }
+
+        for(auto& t : threads_)
+            if(t.joinable())
+                t.join();
     }
 
     void
     stop() noexcept
     {
-        stop_.store(true, std::memory_order_release);
-        cv_.notify_all();
+        {
+            std::lock_guard<std::mutex> lock(mutex_);
+            stop_ = true;
+        }
+        work_cv_.notify_all();
+        done_cv_.notify_all();
     }
 
 private:
@@ -136,21 +226,29 @@ class thread_pool::impl
         std::snprintf(name, sizeof(name), "%s%zu", thread_name_prefix_, index);
         set_current_thread_name(name);
 
+        // Mark this thread as a worker of this pool so dispatch()
+        // can symmetric-transfer when called from within pool work.
+        struct scoped_pool
+        {
+            scoped_pool(impl const* p) noexcept { current_.set(p); }
+            ~scoped_pool() noexcept { current_.set(nullptr); }
+        } guard(this);
+
         for(;;)
         {
-            work* w = nullptr;
+            continuation* c = nullptr;
             {
                 std::unique_lock<std::mutex> lock(mutex_);
-                cv_.wait(lock, [this]{
-                    return !q_.empty() ||
-                        stop_.load(std::memory_order_acquire);
+                work_cv_.wait(lock, [this]{
+                    return !empty() ||
+                        stop_;
                 });
-                if(stop_.load(std::memory_order_acquire) && q_.empty())
+                if(stop_)
                     return;
-                w = q_.pop();
+                c = pop();
             }
-            if(w)
-                w->run();
+            if(c)
+                safe_resume(c->h);
         }
     }
 };
@@ -160,6 +258,9 @@ class thread_pool::impl
 thread_pool::
 ~thread_pool()
 {
+    impl_->stop();
+    impl_->join();
+    impl_->drain_abandoned();
     shutdown();
     destroy();
     delete impl_;
@@ -172,6 +273,13 @@ thread_pool(std::size_t num_threads, std::string_view thread_name_prefix)
     this->set_frame_allocator(std::allocator<void>{});
 }
 
+void
+thread_pool::
+join() noexcept
+{
+    impl_->join();
+}
+
 void
 thread_pool::
 stop() noexcept
@@ -181,11 +289,43 @@ stop() noexcept
 
 //------------------------------------------------------------------------------
 
+thread_pool::executor_type
+thread_pool::
+get_executor() const noexcept
+{
+    return executor_type(
+        const_cast<thread_pool&>(*this));
+}
+
+void
+thread_pool::executor_type::
+on_work_started() const noexcept
+{
+    pool_->impl_->on_work_started();
+}
+
+void
+thread_pool::executor_type::
+on_work_finished() const noexcept
+{
+    pool_->impl_->on_work_finished();
+}
+
 void
 thread_pool::executor_type::
-post(std::coroutine_handle<> h) const
+post(continuation& c) const
+{
+    pool_->impl_->post(c);
+}
+
+std::coroutine_handle<>
+thread_pool::executor_type::
+dispatch(continuation& c) const
 {
-    pool_->impl_->post(h);
+    if(pool_->impl_->running_in_this_thread())
+        return c.h;
+    pool_->impl_->post(c);
+    return std::noop_coroutine();
 }
 
 } // capy
diff --git a/src/test/run_blocking.cpp b/src/test/run_blocking.cpp
index b9cb9eb89..143dc5a26 100644
--- a/src/test/run_blocking.cpp
+++ b/src/test/run_blocking.cpp
@@ -9,6 +9,7 @@
 
 #include <boost/capy/test/run_blocking.hpp>
 
+#include <boost/capy/ex/frame_allocator.hpp>
 #include <condition_variable>
 #include <mutex>
 #include <queue>
@@ -76,7 +77,7 @@ blocking_context::run()
             h = impl_->queue.front();
             impl_->queue.pop();
         }
-        h.resume();
+        safe_resume(h);
     }
     if(impl_->ep)
         std::rethrow_exception(impl_->ep);
@@ -86,10 +87,8 @@ void
 blocking_context::enqueue(
     std::coroutine_handle<> h)
 {
-    {
-        std::lock_guard<std::mutex> lock(impl_->mtx);
-        impl_->queue.push(h);
-    }
+    std::lock_guard<std::mutex> lock(impl_->mtx);
+    impl_->queue.push(h);
     impl_->cv.notify_one();
 }
 
@@ -120,16 +119,16 @@ blocking_executor::on_work_finished() const noexcept
 
 std::coroutine_handle<>
 blocking_executor::dispatch(
-    std::coroutine_handle<> h) const
+    continuation& c) const
 {
-    return h;
+    return c.h;
 }
 
 void
 blocking_executor::post(
-    std::coroutine_handle<> h) const
+    continuation& c) const
 {
-    ctx_->enqueue(h);
+    ctx_->enqueue(c.h);
 }
 
 } // namespace test
diff --git a/test/cmake_test/CMakeLists.txt b/test/cmake_test/CMakeLists.txt
index c992aea32..b343cbdb6 100644
--- a/test/cmake_test/CMakeLists.txt
+++ b/test/cmake_test/CMakeLists.txt
@@ -6,7 +6,7 @@
 # https://www.boost.org/LICENSE_1_0.txt
 #
 
-cmake_minimum_required(VERSION 3.5...3.16)
+cmake_minimum_required(VERSION 3.13...3.31)
 
 project(cmake_subdir_test LANGUAGES CXX)
 set(__ignore__ ${CMAKE_C_COMPILER})
@@ -16,11 +16,13 @@ if(BOOST_CI_INSTALL_TEST)
   find_package(Boost CONFIG REQUIRED COMPONENTS capy)
 else()
   set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
-  add_subdirectory(../.. boostorg/capy)
+  set(BOOST_INCLUDE_LIBRARIES capy)
+  add_subdirectory(../../../.. deps/boost EXCLUDE_FROM_ALL)
 endif()
 
 add_executable(main main.cpp)
 target_link_libraries(main Boost::capy)
+target_compile_features(main PRIVATE cxx_std_20)
 
 enable_testing()
 add_test(NAME main COMMAND main)
diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt
index 0d744adfb..14774345b 100644
--- a/test/unit/CMakeLists.txt
+++ b/test/unit/CMakeLists.txt
@@ -7,10 +7,6 @@
 # Official repository: https://github.com/cppalliance/capy
 #
 
-if(NOT TARGET boost_capy_test_suite)
-    add_subdirectory(../../extra/test_suite test_suite)
-endif()
-
 file(GLOB_RECURSE PFILES CONFIGURE_DEPENDS *.cpp *.hpp)
 list(APPEND PFILES
     CMakeLists.txt
@@ -21,7 +17,7 @@ source_group(TREE ${CMAKE_CURRENT_SOURCE_DIR} PREFIX "" FILES ${PFILES})
 add_executable(boost_capy_tests ${PFILES})
 target_link_libraries(
     boost_capy_tests PRIVATE
-    boost_capy_test_suite_main
+    Boost::capy_test_suite_main
     Boost::capy)
 
 target_include_directories(boost_capy_tests PRIVATE . ../../)
diff --git a/test/unit/buffers/asio.cpp b/test/unit/buffers/asio.cpp
index 2be261807..c978f75ce 100644
--- a/test/unit/buffers/asio.cpp
+++ b/test/unit/buffers/asio.cpp
@@ -11,7 +11,7 @@
 
 #include <boost/capy/buffers.hpp>
 #include <boost/capy/buffers/asio.hpp>
-#include <boost/capy/buffers/buffer_array.hpp>
+#include <boost/capy/detail/buffer_array.hpp>
 #include <boost/capy/buffers/buffer_copy.hpp>
 #include <boost/asio/buffer.hpp>
 
@@ -28,9 +28,9 @@ namespace capy {
 // to_asio result satisfies asio buffer sequence traits
 
 using to_asio_const_t = decltype(
-    to_asio(std::declval<const_buffer_array<4> const&>()));
+    to_asio(std::declval<detail::const_buffer_array<4> const&>()));
 using to_asio_mutable_t = decltype(
-    to_asio(std::declval<mutable_buffer_array<4> const&>()));
+    to_asio(std::declval<detail::mutable_buffer_array<4> const&>()));
 
 static_assert(asio::is_const_buffer_sequence<to_asio_const_t>::value);
 static_assert(asio::is_const_buffer_sequence<to_asio_mutable_t>::value);
@@ -117,8 +117,8 @@ struct asio_test
     {
         char d1[] = "abc";
         char d2[] = "defgh";
-        mutable_buffer_array<4> bufs;
-        bufs = mutable_buffer_array<4>(
+        detail::mutable_buffer_array<4> bufs;
+        bufs = detail::mutable_buffer_array<4>(
             std::array<mutable_buffer, 2>{{
                 mutable_buffer(d1, 3),
                 mutable_buffer(d2, 5)
@@ -292,8 +292,8 @@ struct asio_test
     test_move_semantics()
     {
         char d1[] = "abc";
-        mutable_buffer_array<4> bufs;
-        bufs = mutable_buffer_array<4>(
+        detail::mutable_buffer_array<4> bufs;
+        bufs = detail::mutable_buffer_array<4>(
             mutable_buffer(d1, 3));
 
         auto adapted = to_asio(std::move(bufs));
diff --git a/test/unit/buffers/buffer.cpp b/test/unit/buffers/buffer.cpp
index d03a29afe..dfda2e296 100644
--- a/test/unit/buffers/buffer.cpp
+++ b/test/unit/buffers/buffer.cpp
@@ -12,6 +12,7 @@
 
 #include <boost/capy.hpp>
 #include <array>
+#include <ranges>
 #include <span>
 
 #include "test_buffers.hpp"
@@ -19,6 +20,8 @@
 namespace boost {
 namespace capy {
 
+// Buffer Sequence Concepts
+
 static_assert(  ConstBufferSequence<const_buffer>);
 static_assert(  ConstBufferSequence<mutable_buffer>);
 static_assert(! MutableBufferSequence<const_buffer>);
@@ -49,6 +52,96 @@ static_assert(  ConstBufferSequence<mutable_buffer[3]>);
 static_assert(! MutableBufferSequence<const_buffer[3]>);
 static_assert(  MutableBufferSequence<mutable_buffer[3]>);
 
+// std::ranges concepts for span<const_buffer>
+
+static_assert(std::ranges::range<std::span<const_buffer>>);
+static_assert(std::ranges::input_range<std::span<const_buffer>>);
+static_assert(std::ranges::forward_range<std::span<const_buffer>>);
+static_assert(std::ranges::bidirectional_range<std::span<const_buffer>>);
+static_assert(std::ranges::random_access_range<std::span<const_buffer>>);
+static_assert(std::ranges::contiguous_range<std::span<const_buffer>>);
+
+// std::ranges concepts for span<mutable_buffer>
+
+static_assert(std::ranges::range<std::span<mutable_buffer>>);
+static_assert(std::ranges::input_range<std::span<mutable_buffer>>);
+static_assert(std::ranges::forward_range<std::span<mutable_buffer>>);
+static_assert(std::ranges::bidirectional_range<std::span<mutable_buffer>>);
+static_assert(std::ranges::random_access_range<std::span<mutable_buffer>>);
+static_assert(std::ranges::contiguous_range<std::span<mutable_buffer>>);
+
+// std::ranges concepts for array<const_buffer, N>
+
+static_assert(std::ranges::range<std::array<const_buffer, 3>>);
+static_assert(std::ranges::input_range<std::array<const_buffer, 3>>);
+static_assert(std::ranges::forward_range<std::array<const_buffer, 3>>);
+static_assert(std::ranges::bidirectional_range<std::array<const_buffer, 3>>);
+static_assert(std::ranges::random_access_range<std::array<const_buffer, 3>>);
+static_assert(std::ranges::contiguous_range<std::array<const_buffer, 3>>);
+
+// std::ranges concepts for array<mutable_buffer, N>
+
+static_assert(std::ranges::range<std::array<mutable_buffer, 3>>);
+static_assert(std::ranges::input_range<std::array<mutable_buffer, 3>>);
+static_assert(std::ranges::forward_range<std::array<mutable_buffer, 3>>);
+static_assert(std::ranges::bidirectional_range<std::array<mutable_buffer, 3>>);
+static_assert(std::ranges::random_access_range<std::array<mutable_buffer, 3>>);
+static_assert(std::ranges::contiguous_range<std::array<mutable_buffer, 3>>);
+
+// std::ranges concepts for 2-element buffer arrays
+
+static_assert(std::ranges::range<std::array<const_buffer, 2>>);
+static_assert(std::ranges::bidirectional_range<std::array<const_buffer, 2>>);
+static_assert(std::ranges::random_access_range<std::array<const_buffer, 2>>);
+
+static_assert(std::ranges::range<std::array<mutable_buffer, 2>>);
+static_assert(std::ranges::bidirectional_range<std::array<mutable_buffer, 2>>);
+static_assert(std::ranges::random_access_range<std::array<mutable_buffer, 2>>);
+
+// std::views producing valid ConstBufferSequence
+
+using span_cb = std::span<const_buffer>;
+using span_mb = std::span<mutable_buffer>;
+
+// take_view preserves bidirectional + value type
+using take_cb = decltype(std::declval<span_cb>() | std::views::take(1));
+static_assert(std::ranges::bidirectional_range<take_cb>);
+static_assert(std::is_convertible_v<std::ranges::range_value_t<take_cb>, const_buffer>);
+static_assert(ConstBufferSequence<take_cb>);
+
+using take_mb = decltype(std::declval<span_mb>() | std::views::take(1));
+static_assert(std::ranges::bidirectional_range<take_mb>);
+static_assert(MutableBufferSequence<take_mb>);
+
+// drop_view preserves bidirectional + value type
+using drop_cb = decltype(std::declval<span_cb>() | std::views::drop(1));
+static_assert(std::ranges::bidirectional_range<drop_cb>);
+static_assert(ConstBufferSequence<drop_cb>);
+
+using drop_mb = decltype(std::declval<span_mb>() | std::views::drop(1));
+static_assert(std::ranges::bidirectional_range<drop_mb>);
+static_assert(MutableBufferSequence<drop_mb>);
+
+// reverse_view preserves bidirectional + value type
+using rev_cb = decltype(std::declval<span_cb>() | std::views::reverse);
+static_assert(std::ranges::bidirectional_range<rev_cb>);
+static_assert(ConstBufferSequence<rev_cb>);
+
+using rev_mb = decltype(std::declval<span_mb>() | std::views::reverse);
+static_assert(std::ranges::bidirectional_range<rev_mb>);
+static_assert(MutableBufferSequence<rev_mb>);
+
+// filter_view is bidirectional but not const-iterable;
+// it satisfies ConstBufferSequence for non-const lvalue
+// but the buffer APIs take const& so filter_view cannot
+// be used directly with buffer_size, buffer_copy, etc.
+using filt_cb = decltype(
+    std::declval<span_cb>()
+        | std::views::filter([](const_buffer b) { return b.size() > 0; }));
+static_assert(std::ranges::bidirectional_range<filt_cb>);
+static_assert(ConstBufferSequence<filt_cb>);
+static_assert(!ConstBufferSequence<filt_cb const>);
+
 namespace {
 
 // test fixture
@@ -85,9 +178,9 @@ struct fixt<mutable_buffer>
 };
 
 template<>
-struct fixt<const_buffer_pair>
+struct fixt<std::array<const_buffer, 2>>
 {
-    const_buffer_pair t;
+    std::array<const_buffer, 2> t;
     fixt(std::string_view pat)
         : t{{ {buf(pat.substr(0, 3))}, {buf(pat.substr(3))} }}
     {
@@ -95,10 +188,10 @@ struct fixt<const_buffer_pair>
 };
 
 template<>
-struct fixt<mutable_buffer_pair>
+struct fixt<std::array<mutable_buffer, 2>>
 {
     char data[64];
-    mutable_buffer_pair t;
+    std::array<mutable_buffer, 2> t;
     fixt(std::string_view pat)
         : t{{{data,3}, {data+3, pat.size()-3}}}
     {
@@ -223,7 +316,8 @@ struct buffer_test
             char data[64];
             mutable_buffer mb(data, sizeof(data));
             fixt<T> f(pat);
-            keep_prefix(mb, buffer_copy(mb, f.t));
+            auto const n = buffer_copy(mb, f.t);
+            mb = mutable_buffer(mb.data(), n);
             BOOST_TEST_EQ(test::make_string(mb), pat);
         }
     }
@@ -232,8 +326,8 @@ struct buffer_test
     {
         testBuffer<const_buffer>();
         testBuffer<mutable_buffer>();
-        testBuffer<const_buffer_pair>();
-        testBuffer<mutable_buffer_pair>();
+        testBuffer<std::array<const_buffer, 2>>();
+        testBuffer<std::array<mutable_buffer, 2>>();
         testBuffer<std::span<const_buffer,3>>();
         testBuffer<std::span<mutable_buffer,3>>();
         testBuffer<std::array<const_buffer,3>>();
@@ -390,17 +484,17 @@ struct buffer_test
 
         // empty buffer_pair (both empty)
         {
-            const_buffer_pair cbp{{ {data, 0}, {data, 0} }};
+            std::array<const_buffer, 2> cbp{{ {data, 0}, {data, 0} }};
             BOOST_TEST(buffer_empty(cbp));
         }
 
         // non-empty buffer_pair (one non-empty)
         {
-            const_buffer_pair cbp{{ {data, 0}, {data, 3} }};
+            std::array<const_buffer, 2> cbp{{ {data, 0}, {data, 3} }};
             BOOST_TEST(! buffer_empty(cbp));
         }
         {
-            const_buffer_pair cbp{{ {data, 3}, {data, 0} }};
+            std::array<const_buffer, 2> cbp{{ {data, 3}, {data, 0} }};
             BOOST_TEST(! buffer_empty(cbp));
         }
 
@@ -444,6 +538,45 @@ struct buffer_test
         }
     }
 
+    void testViews()
+    {
+        char data[9] = "ABCDEFGH";
+        const_buffer cb[3] = {
+            { data, 3 },
+            { data + 3, 3 },
+            { data + 6, 2 }
+        };
+        std::span<const_buffer> bufs(cb, 3);
+
+        // take: first 2 buffers = "ABCDEF"
+        {
+            auto v = bufs | std::views::take(2);
+            BOOST_TEST_EQ(buffer_size(v), 6u);
+            BOOST_TEST_EQ(test::make_string(v), "ABCDEF");
+        }
+
+        // drop: skip first buffer = "DEFGH"
+        {
+            auto v = bufs | std::views::drop(1);
+            BOOST_TEST_EQ(buffer_size(v), 5u);
+            BOOST_TEST_EQ(test::make_string(v), "DEFGH");
+        }
+
+        // reverse: buffers in reverse order = "GHDEFABC"
+        {
+            auto v = bufs | std::views::reverse;
+            BOOST_TEST_EQ(buffer_size(v), 8u);
+            BOOST_TEST_EQ(test::make_string(v), "GHDEFABC");
+        }
+
+        // take + drop composition = middle buffer only
+        {
+            auto v = bufs | std::views::drop(1) | std::views::take(1);
+            BOOST_TEST_EQ(buffer_size(v), 3u);
+            BOOST_TEST_EQ(test::make_string(v), "DEF");
+        }
+    }
+
     void run()
     {
         testBuffers();
@@ -451,6 +584,7 @@ struct buffer_test
         testMutableBuffer();
         testSize();
         testEmpty();
+        testViews();
     }
 };
 
diff --git a/test/unit/buffers/buffer_copy.cpp b/test/unit/buffers/buffer_copy.cpp
index c289d36aa..01a80130f 100644
--- a/test/unit/buffers/buffer_copy.cpp
+++ b/test/unit/buffers/buffer_copy.cpp
@@ -10,7 +10,7 @@
 // Test that header file is self-contained.
 #include <boost/capy/buffers/buffer_copy.hpp>
 
-#include <boost/capy/buffers/buffer_pair.hpp>
+#include <array>
 #include <span>
 #include "test_buffers.hpp"
 
@@ -34,12 +34,12 @@ struct buffer_copy_test
                 for(std::size_t k = 0;
                     k < N + 2; ++k)
                 {
-                    const_buffer_pair p0{{
+                    std::array<const_buffer, 2> p0{{
                         const_buffer(s.data(), i),
                         const_buffer(s.data() + i, N - i) }};
                     char tmp[13];
                     std::memset(tmp, 0, sizeof(tmp));
-                    mutable_buffer_pair p1{{
+                    std::array<mutable_buffer, 2> p1{{
                         mutable_buffer(tmp, j),
                         mutable_buffer(tmp + j, N - j) }};
                     auto const n = buffer_copy(
diff --git a/test/unit/buffers/buffer_pair.cpp b/test/unit/buffers/buffer_pair.cpp
deleted file mode 100644
index 83f30d108..000000000
--- a/test/unit/buffers/buffer_pair.cpp
+++ /dev/null
@@ -1,197 +0,0 @@
-//
-// Copyright (c) 2023 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-// Test that header file is self-contained.
-#include <boost/capy/buffers/buffer_pair.hpp>
-
-#include "test_buffers.hpp"
-
-namespace boost {
-namespace capy {
-
-struct buffer_pair_test
-{
-    void
-    testConstPair()
-    {
-        auto const& pat = test_pattern();
-
-        // const_buffer_pair()
-        {
-            const_buffer_pair cb;
-            BOOST_TEST_EQ(
-                buffer_size(cb), 0);
-        }
-
-        // const_buffer_pair(
-        //  const_buffer_pair const&),
-        // const_buffer_pair(
-        //  const_buffer const&)
-        //  const_buffer const&)
-        {
-            for(std::size_t i = 0;
-                i <= pat.size(); ++i)
-            {
-                const_buffer_pair cb0({{
-                    { &pat[0], i },
-                    { &pat[i], pat.size() - i }}});
-                const_buffer_pair cb1(cb0);
-                BOOST_TEST_EQ(
-                    test::make_string(cb0), pat);
-                BOOST_TEST_EQ(
-                    test::make_string(cb0),
-                    test::make_string(cb1));
-                BOOST_TEST_EQ(
-                    cb0[0].data(), cb1[0].data());
-                BOOST_TEST_EQ(
-                    cb0[1].size(), cb1[1].size());
-                auto const& ccb0 = cb0;
-                auto const& ccb1 = cb1;
-                BOOST_TEST_EQ(
-                    ccb0[0].data(), ccb1[0].data());
-                BOOST_TEST_EQ(
-                    ccb0[1].size(), ccb1[1].size());
-            }
-        }
-
-        // operator=(const_buffer_pair const&)
-        {
-            for(std::size_t i = 0;
-                i <= pat.size(); ++i)
-            {
-                const_buffer_pair cb0({{
-                    { &pat[0], i },
-                    { &pat[i], pat.size() - i }}});
-                const_buffer_pair cb1;
-                cb1 = cb0;
-                BOOST_TEST_EQ(
-                    test::make_string(cb0), pat);
-                BOOST_TEST_EQ(
-                    test::make_string(cb0),
-                    test::make_string(cb1));
-            }
-        }
-
-        {
-            for(std::size_t i = 0;
-                i <= pat.size(); ++i)
-            {
-                const_buffer_pair cb({{
-                    { &pat[0], i },
-                    { &pat[i], pat.size() - i }}});
-                test::check_sequence(cb, pat);
-            }
-        }
-    }
-
-    void
-    testMutablePair()
-    {
-        std::string pat = test_pattern();
-
-        // mutable_buffer_pair()
-        {
-            mutable_buffer_pair mb;
-            BOOST_TEST_EQ(buffer_size(mb), 0);
-        }
-
-        // mutable_buffer_pair(
-        //  mutable_buffer_pair const&),
-        // mutable_buffer_pair(
-        //  mutable_buffer const&)
-        //  mutable_buffer const&)
-        {
-            for(std::size_t i = 0;
-                i <= pat.size(); ++i)
-            {
-                mutable_buffer_pair mb0({{
-                    { &pat[0], i },
-                    { &pat[i], pat.size() - i }}});
-                mutable_buffer_pair mb1(mb0);
-                BOOST_TEST_EQ(
-                    test::make_string(mb0), pat);
-                BOOST_TEST_EQ(
-                    test::make_string(mb0),
-                    test::make_string(mb1));
-                BOOST_TEST_EQ(
-                    mb0[0].data(), mb1[0].data());
-                BOOST_TEST_EQ(
-                    mb0[1].size(), mb1[1].size());
-                auto const& cmb0 = mb0;
-                auto const& cmb1 = mb1;
-                BOOST_TEST_EQ(
-                    cmb0[0].data(), cmb1[0].data());
-                BOOST_TEST_EQ(
-                    cmb0[1].size(), cmb1[1].size());
-            }
-        }
-
-        // operator=(mutable_buffer_pair const&)
-        {
-            for(std::size_t i = 0;
-                i <= pat.size(); ++i)
-            {
-                mutable_buffer_pair mb0({{
-                    { &pat[0], i },
-                    { &pat[i], pat.size() - i }}});
-                mutable_buffer_pair mb1;
-                mb1 = mb0;
-                BOOST_TEST_EQ(
-                    test::make_string(mb0), pat);
-                BOOST_TEST_EQ(
-                    test::make_string(mb0),
-                    test::make_string(mb1));
-            }
-        }
-
-        // operator=(mutable_buffer_pair const&)
-        {
-            for(std::size_t i = 0;
-                i <= pat.size(); ++i)
-            {
-                auto s = pat;
-                mutable_buffer_pair b({{
-                    { &s[0], i },
-                    { &s[i], s.size() - i }}});
-                mutable_buffer_pair mb;
-                mb = b;
-                BOOST_TEST_EQ(
-                    test::make_string(mb), pat);
-                BOOST_TEST_EQ(
-                    test::make_string(mb),
-                    test::make_string(b));
-            }
-        }
-
-        {
-            for(std::size_t i = 0;
-                i <= pat.size(); ++i)
-            {
-                mutable_buffer_pair cb({{
-                    { &pat[0], i },
-                    { &pat[i], pat.size() - i }}});
-                test::check_sequence(cb, pat);
-            }
-        }
-    }
-
-    void
-    run()
-    {
-        testConstPair();
-        testMutablePair();
-    }
-};
-
-TEST_SUITE(
-    buffer_pair_test,
-    "boost.capy.buffers.buffer_pair");
-
-} // capy
-} // boost
diff --git a/test/unit/buffers/buffer_slice.cpp b/test/unit/buffers/buffer_slice.cpp
new file mode 100644
index 000000000..371e1246b
--- /dev/null
+++ b/test/unit/buffers/buffer_slice.cpp
@@ -0,0 +1,423 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// Test that headers are self-contained.
+#include <boost/capy/buffers/buffer_slice.hpp>
+#include <boost/capy/concept/slice.hpp>
+#include <boost/capy/detail/slice_impl.hpp>
+
+#include <boost/capy/buffers.hpp>
+
+#include <array>
+#include <cstring>
+#include <ranges>
+#include <string>
+
+#include "test_suite.hpp"
+
+namespace boost {
+namespace capy {
+
+namespace {
+
+// Flatten the bytes exposed by a Slice's data() into a std::string for
+// byte-exact comparison.
+template<class Slice>
+std::string flatten(Slice const& s)
+{
+    std::string out;
+    auto view = s.data();
+    for (auto it = view.begin(); it != view.end(); ++it)
+    {
+        auto buf = *it;
+        out.append(
+            static_cast<char const*>(buf.data()),
+            buf.size());
+    }
+    return out;
+}
+
+} // anonymous namespace
+
+struct buffer_slice_test
+{
+    void
+    testConceptModeled()
+    {
+        char a[10];
+        std::array<mutable_buffer, 1> mbufs = {
+            mutable_buffer(a, sizeof(a))
+        };
+        std::array<const_buffer, 1> cbufs = {
+            const_buffer(a, sizeof(a))
+        };
+        using m_slice = detail::slice_impl<decltype(mbufs)>;
+        using c_slice = detail::slice_impl<decltype(cbufs)>;
+
+        // Both satisfy Slice
+        static_assert(Slice<m_slice>,
+            "mutable-input slice_impl must satisfy Slice");
+        static_assert(Slice<c_slice>,
+            "const-input slice_impl must satisfy Slice");
+
+        // Only the mutable-input one satisfies MutableSlice
+        static_assert(MutableSlice<m_slice>,
+            "mutable-input slice_impl must satisfy MutableSlice");
+        static_assert(!MutableSlice<c_slice>,
+            "const-input slice_impl must NOT satisfy MutableSlice");
+    }
+
+    void
+    testNotABufferSequence()
+    {
+        char a[10];
+        std::array<mutable_buffer, 1> bufs = {
+            mutable_buffer(a, sizeof(a))
+        };
+        using slice_t = detail::slice_impl<decltype(bufs)>;
+        static_assert(
+            !ConstBufferSequence<slice_t>,
+            "slice_impl must not model ConstBufferSequence");
+        static_assert(
+            !MutableBufferSequence<slice_t>,
+            "slice_impl must not model MutableBufferSequence");
+    }
+
+    void
+    testDataIsBufferSequence()
+    {
+        char a[10];
+        std::array<mutable_buffer, 1> bufs = {
+            mutable_buffer(a, sizeof(a))
+        };
+        detail::slice_impl<decltype(bufs)> s(bufs);
+        using data_t = decltype(s.data());
+        static_assert(
+            MutableBufferSequence<data_t>,
+            "data() return must satisfy MutableBufferSequence "
+            "when input is mutable");
+        static_assert(
+            ConstBufferSequence<data_t>,
+            "data() return must satisfy ConstBufferSequence");
+        static_assert(
+            std::ranges::bidirectional_range<data_t>,
+            "data() return must be a bidirectional_range");
+    }
+
+    void
+    testWholeSequenceCtor()
+    {
+        char a[10];
+        char b[20];
+        std::memset(a, 'A', sizeof(a));
+        std::memset(b, 'B', sizeof(b));
+        std::array<mutable_buffer, 2> bufs = {
+            mutable_buffer(a, sizeof(a)),
+            mutable_buffer(b, sizeof(b))
+        };
+        detail::slice_impl<decltype(bufs)> s(bufs);
+        BOOST_TEST_EQ(buffer_size(s.data()), sizeof(a) + sizeof(b));
+
+        std::string const expected =
+            std::string(sizeof(a), 'A') + std::string(sizeof(b), 'B');
+        BOOST_TEST_EQ(flatten(s), expected);
+    }
+
+    void
+    testOffsetLengthCtor()
+    {
+        char a[10];
+        char b[20];
+        std::memset(a, 'A', sizeof(a));
+        std::memset(b, 'B', sizeof(b));
+        std::array<mutable_buffer, 2> bufs = {
+            mutable_buffer(a, sizeof(a)),
+            mutable_buffer(b, sizeof(b))
+        };
+        using slice_t = detail::slice_impl<decltype(bufs)>;
+
+        // offset=0, length=total -> whole sequence
+        {
+            slice_t s(bufs, 0, 30);
+            BOOST_TEST_EQ(buffer_size(s.data()), 30u);
+            BOOST_TEST_EQ(flatten(s),
+                std::string(10, 'A') + std::string(20, 'B'));
+        }
+
+        // offset inside first buffer (front trim, no back trim)
+        {
+            slice_t s(bufs, 3, 27);
+            BOOST_TEST_EQ(buffer_size(s.data()), 27u);
+            BOOST_TEST_EQ(flatten(s),
+                std::string(7, 'A') + std::string(20, 'B'));
+        }
+
+        // offset past first buffer, length terminating inside last (front + back)
+        {
+            slice_t s(bufs, 12, 5);
+            BOOST_TEST_EQ(buffer_size(s.data()), 5u);
+            BOOST_TEST_EQ(flatten(s), std::string(5, 'B'));
+        }
+
+        // both offset and length inside first buffer
+        {
+            slice_t s(bufs, 2, 4);
+            BOOST_TEST_EQ(buffer_size(s.data()), 4u);
+            BOOST_TEST_EQ(flatten(s), std::string(4, 'A'));
+        }
+
+        // offset=0, length=0 -> empty
+        {
+            slice_t s(bufs, 0, 0);
+            BOOST_TEST_EQ(buffer_size(s.data()), 0u);
+            BOOST_TEST_EQ(flatten(s), std::string());
+        }
+
+        // offset >= total -> empty (no UB)
+        {
+            slice_t s(bufs, 50, 10);
+            BOOST_TEST_EQ(buffer_size(s.data()), 0u);
+        }
+
+        // length > total - offset -> clamped to remainder
+        {
+            slice_t s(bufs, 5, 999);
+            BOOST_TEST_EQ(buffer_size(s.data()), 25u);
+            BOOST_TEST_EQ(flatten(s),
+                std::string(5, 'A') + std::string(20, 'B'));
+        }
+    }
+
+    void
+    testRemovePrefix()
+    {
+        char a[10];
+        char b[20];
+        std::memset(a, 'A', sizeof(a));
+        std::memset(b, 'B', sizeof(b));
+        std::array<mutable_buffer, 2> bufs = {
+            mutable_buffer(a, sizeof(a)),
+            mutable_buffer(b, sizeof(b))
+        };
+        using slice_t = detail::slice_impl<decltype(bufs)>;
+
+        // remove within first buffer
+        {
+            slice_t s(bufs);
+            s.remove_prefix(3);
+            BOOST_TEST_EQ(buffer_size(s.data()), 27u);
+            BOOST_TEST_EQ(flatten(s),
+                std::string(7, 'A') + std::string(20, 'B'));
+        }
+
+        // remove exactly to end of first buffer
+        {
+            slice_t s(bufs);
+            s.remove_prefix(10);
+            BOOST_TEST_EQ(buffer_size(s.data()), 20u);
+            BOOST_TEST_EQ(flatten(s), std::string(20, 'B'));
+        }
+
+        // remove crossing into second buffer
+        {
+            slice_t s(bufs);
+            s.remove_prefix(15);
+            BOOST_TEST_EQ(buffer_size(s.data()), 15u);
+            BOOST_TEST_EQ(flatten(s), std::string(15, 'B'));
+        }
+
+        // remove all
+        {
+            slice_t s(bufs);
+            s.remove_prefix(30);
+            BOOST_TEST_EQ(buffer_size(s.data()), 0u);
+        }
+
+        // remove more than total -> empty, no UB
+        {
+            slice_t s(bufs);
+            s.remove_prefix(1000);
+            BOOST_TEST_EQ(buffer_size(s.data()), 0u);
+        }
+    }
+
+    void
+    testRemovePrefixOnLengthCapped()
+    {
+        // Verify remove_prefix walks correctly through a slice that has
+        // back_skip_ set by an offset/length constructor.
+        char a[5];
+        char b[5];
+        char c[5];
+        std::memset(a, 'a', sizeof(a));
+        std::memset(b, 'b', sizeof(b));
+        std::memset(c, 'c', sizeof(c));
+        std::array<mutable_buffer, 3> bufs = {
+            mutable_buffer(a, sizeof(a)),
+            mutable_buffer(b, sizeof(b)),
+            mutable_buffer(c, sizeof(c))
+        };
+        using slice_t = detail::slice_impl<decltype(bufs)>;
+
+        // bytes 2..12 -> [3 'a' + 5 'b' + 2 'c']
+        slice_t s(bufs, 2, 10);
+        BOOST_TEST_EQ(buffer_size(s.data()), 10u);
+        BOOST_TEST_EQ(flatten(s),
+            std::string(3, 'a') + std::string(5, 'b') + std::string(2, 'c'));
+
+        // remove 4 -> [4 'b' + 2 'c'] (consumed 3 'a' + 1 'b')
+        s.remove_prefix(4);
+        BOOST_TEST_EQ(buffer_size(s.data()), 6u);
+        BOOST_TEST_EQ(flatten(s),
+            std::string(4, 'b') + std::string(2, 'c'));
+
+        // remove 5 -> [1 'c'] (consumed 4 'b' + 1 'c')
+        s.remove_prefix(5);
+        BOOST_TEST_EQ(buffer_size(s.data()), 1u);
+        BOOST_TEST_EQ(flatten(s), std::string(1, 'c'));
+
+        // remove 1 -> empty
+        s.remove_prefix(1);
+        BOOST_TEST_EQ(buffer_size(s.data()), 0u);
+    }
+
+    void
+    testEmpty()
+    {
+        // default-constructed slice
+        detail::slice_impl<std::array<mutable_buffer, 0>> s{};
+        BOOST_TEST_EQ(buffer_size(s.data()), 0u);
+        s.remove_prefix(5);
+        BOOST_TEST_EQ(buffer_size(s.data()), 0u);
+    }
+
+    void
+    testMutableVsConst()
+    {
+        char a[10];
+        std::array<mutable_buffer, 1> mbufs = {
+            mutable_buffer(a, sizeof(a))
+        };
+        std::array<const_buffer, 1> cbufs = {
+            const_buffer(a, sizeof(a))
+        };
+        using m_slice = detail::slice_impl<decltype(mbufs)>;
+        using c_slice = detail::slice_impl<decltype(cbufs)>;
+
+        static_assert(
+            std::is_same_v<m_slice::buffer_type, mutable_buffer>,
+            "mutable input -> mutable buffer_type");
+        static_assert(
+            std::is_same_v<c_slice::buffer_type, const_buffer>,
+            "const input -> const buffer_type");
+
+        m_slice ms(mbufs);
+        c_slice cs(cbufs);
+        BOOST_TEST_EQ(buffer_size(ms.data()), 10u);
+        BOOST_TEST_EQ(buffer_size(cs.data()), 10u);
+    }
+
+    void
+    testSingleBuffer()
+    {
+        char a[10];
+        std::memset(a, 'X', sizeof(a));
+        mutable_buffer mb(a, sizeof(a));
+
+        detail::slice_impl<mutable_buffer> s(mb);
+        BOOST_TEST_EQ(buffer_size(s.data()), 10u);
+        BOOST_TEST_EQ(flatten(s), std::string(10, 'X'));
+
+        s.remove_prefix(3);
+        BOOST_TEST_EQ(buffer_size(s.data()), 7u);
+        BOOST_TEST_EQ(flatten(s), std::string(7, 'X'));
+    }
+
+    void
+    testPublicFunction()
+    {
+        char a[10];
+        char b[20];
+        std::memset(a, 'A', sizeof(a));
+        std::memset(b, 'B', sizeof(b));
+        std::array<mutable_buffer, 2> bufs = {
+            mutable_buffer(a, sizeof(a)),
+            mutable_buffer(b, sizeof(b))
+        };
+
+        // default args: whole sequence
+        {
+            auto s = buffer_slice(bufs);
+            static_assert(Slice<decltype(s)>,
+                "buffer_slice's return must satisfy Slice");
+            static_assert(MutableSlice<decltype(s)>,
+                "buffer_slice over mutable input must satisfy MutableSlice");
+            BOOST_TEST_EQ(buffer_size(s.data()), 30u);
+            BOOST_TEST_EQ(flatten(s),
+                std::string(10, 'A') + std::string(20, 'B'));
+        }
+
+        // const input -> Slice but not MutableSlice
+        {
+            std::array<const_buffer, 1> cbufs = {
+                const_buffer(a, sizeof(a))
+            };
+            auto s = buffer_slice(cbufs);
+            static_assert(Slice<decltype(s)>,
+                "buffer_slice over const input must satisfy Slice");
+            static_assert(!MutableSlice<decltype(s)>,
+                "buffer_slice over const input must NOT satisfy MutableSlice");
+            BOOST_TEST_EQ(buffer_size(s.data()), 10u);
+        }
+
+        // offset + length
+        {
+            auto s = buffer_slice(bufs, 5, 10);
+            BOOST_TEST_EQ(buffer_size(s.data()), 10u);
+            BOOST_TEST_EQ(flatten(s),
+                std::string(5, 'A') + std::string(5, 'B'));
+        }
+
+        // offset only (length defaults to "to end")
+        {
+            auto s = buffer_slice(bufs, 12);
+            BOOST_TEST_EQ(buffer_size(s.data()), 18u);
+            BOOST_TEST_EQ(flatten(s), std::string(18, 'B'));
+        }
+
+        // single buffer
+        {
+            mutable_buffer mb(a, sizeof(a));
+            auto s = buffer_slice(mb, 2, 5);
+            BOOST_TEST_EQ(buffer_size(s.data()), 5u);
+            BOOST_TEST_EQ(flatten(s), std::string(5, 'A'));
+        }
+    }
+
+    void
+    run()
+    {
+        testConceptModeled();
+        testNotABufferSequence();
+        testDataIsBufferSequence();
+        testWholeSequenceCtor();
+        testOffsetLengthCtor();
+        testRemovePrefix();
+        testRemovePrefixOnLengthCapped();
+        testEmpty();
+        testMutableVsConst();
+        testSingleBuffer();
+        testPublicFunction();
+    }
+};
+
+TEST_SUITE(buffer_slice_test, "boost.capy.buffer_slice");
+
+} // namespace capy
+} // namespace boost
diff --git a/test/unit/buffers/circular_dynamic_buffer.cpp b/test/unit/buffers/circular_dynamic_buffer.cpp
index 02cae5bc7..9657ceced 100644
--- a/test/unit/buffers/circular_dynamic_buffer.cpp
+++ b/test/unit/buffers/circular_dynamic_buffer.cpp
@@ -15,6 +15,11 @@
 #include "test/unit/test_dynamic_buffer.hpp"
 #include "test_buffers.hpp"
 
+#include <algorithm>
+#include <cstring>
+#include <random>
+#include <vector>
+
 namespace boost {
 namespace capy {
 
@@ -100,6 +105,347 @@ struct circular_dynamic_buffer_test
         }
     }
 
+    // Helper: total size of a 2-element buffer pair
+    static std::size_t
+    bp_total_size(std::array<const_buffer, 2> const& bp) noexcept
+    {
+        return bp[0].size() + bp[1].size();
+    }
+
+    static std::size_t
+    bp_total_size(std::array<mutable_buffer, 2> const& bp) noexcept
+    {
+        return bp[0].size() + bp[1].size();
+    }
+
+    // Helper: write a string into the buffer via prepare/commit
+    static void
+    write_string(
+        circular_dynamic_buffer& cb,
+        char const* s,
+        std::size_t len)
+    {
+        auto mb = cb.prepare(len);
+        std::size_t copied = 0;
+        if(mb[0].size() > 0)
+        {
+            auto n = (std::min)(mb[0].size(), len);
+            std::memcpy(mb[0].data(), s, n);
+            copied += n;
+        }
+        if(mb[1].size() > 0 && copied < len)
+        {
+            auto n = (std::min)(mb[1].size(), len - copied);
+            std::memcpy(mb[1].data(), s + copied, n);
+            copied += n;
+        }
+        cb.commit(len);
+    }
+
+    // Helper: read all readable bytes into a string
+    static std::string
+    read_string(circular_dynamic_buffer const& cb)
+    {
+        auto d = cb.data();
+        std::string result;
+        result.append(
+            static_cast<char const*>(d[0].data()),
+            d[0].size());
+        result.append(
+            static_cast<char const*>(d[1].data()),
+            d[1].size());
+        return result;
+    }
+
+    void
+    testDataWrapped()
+    {
+        char buf[8];
+        circular_dynamic_buffer cb{buf, 8};
+
+        write_string(cb, "ABCDEF", 6);
+        cb.consume(5);
+        BOOST_TEST(cb.size() == 1);
+
+        write_string(cb, "GHIJK", 5);
+        BOOST_TEST(cb.size() == 6);
+
+        auto d = cb.data();
+        BOOST_TEST(d[0].size() == 3);
+        BOOST_TEST(d[1].size() == 3);
+        BOOST_TEST(bp_total_size(d) == 6);
+
+        std::string s = read_string(cb);
+        BOOST_TEST(s == "FGHIJK");
+    }
+
+    void
+    testPrepareTooLargeWithExistingData()
+    {
+        char buf[16];
+        circular_dynamic_buffer cb{buf, 16};
+        write_string(cb, "ABCDE", 5);
+        BOOST_TEST_THROWS(cb.prepare(12), std::length_error);
+        auto mb = cb.prepare(11);
+        BOOST_TEST(bp_total_size(mb) == 11);
+    }
+
+    void
+    testPrepareWrapped()
+    {
+        char buf[8];
+        circular_dynamic_buffer cb{buf, 8};
+
+        // Partial consume keeps in_pos_ at 5
+        write_string(cb, "ABCDEF", 6);
+        cb.consume(5);
+
+        // pos=(5+1)%8=6, 6+5=11>8 => wraps
+        auto mb = cb.prepare(5);
+        BOOST_TEST(mb[0].size() == 2);
+        BOOST_TEST(mb[1].size() == 3);
+        BOOST_TEST(bp_total_size(mb) == 5);
+    }
+
+    void
+    testCommitMoreThanPrepared()
+    {
+        char buf[32];
+        circular_dynamic_buffer cb{buf, 32};
+        cb.prepare(10);
+        cb.commit(100);
+        BOOST_TEST(cb.size() == 10);
+    }
+
+    void
+    testCommitZero()
+    {
+        char buf[32];
+        circular_dynamic_buffer cb{buf, 32};
+        cb.prepare(10);
+        cb.commit(0);
+        BOOST_TEST(cb.size() == 0);
+    }
+
+    void
+    testCommitClearsOutSize()
+    {
+        char buf[32];
+        circular_dynamic_buffer cb{buf, 32};
+        cb.prepare(10);
+        cb.commit(5);
+        cb.commit(5);
+        BOOST_TEST(cb.size() == 5);
+    }
+
+    void
+    testConsumeMoreThanSize()
+    {
+        char buf[32];
+        circular_dynamic_buffer cb{buf, 32};
+        write_string(cb, "ABC", 3);
+        cb.consume(100);
+        BOOST_TEST(cb.size() == 0);
+    }
+
+    void
+    testConsumeZero()
+    {
+        char buf[32];
+        circular_dynamic_buffer cb{buf, 32};
+        write_string(cb, "ABCDE", 5);
+        cb.consume(0);
+        BOOST_TEST(cb.size() == 5);
+        BOOST_TEST(read_string(cb) == "ABCDE");
+    }
+
+    void
+    testConsumeAllWithPreparedBuffer()
+    {
+        char buf[16];
+        circular_dynamic_buffer cb{buf, 16};
+        write_string(cb, "ABCDE", 5);
+        cb.prepare(5);
+        cb.consume(5);
+        BOOST_TEST(cb.size() == 0);
+        cb.commit(3);
+        BOOST_TEST(cb.size() == 3);
+    }
+
+    void
+    testConsumeAllNoPrepareResetsPos()
+    {
+        char buf[16];
+        circular_dynamic_buffer cb{buf, 16};
+
+        write_string(cb, "ABCDE", 5);
+        cb.consume(3);
+        cb.consume(2);
+        BOOST_TEST(cb.size() == 0);
+
+        auto mb = cb.prepare(16);
+        BOOST_TEST(mb[0].size() == 16);
+        BOOST_TEST(mb[1].size() == 0);
+    }
+
+    void
+    testWrapAroundRoundTrip()
+    {
+        char buf[8];
+        circular_dynamic_buffer cb{buf, 8};
+
+        // Partial consume to keep in_pos_ at 6
+        write_string(cb, "ABCDEFG", 7);
+        cb.consume(6);
+
+        write_string(cb, "123456", 6);
+        BOOST_TEST(cb.size() == 7);
+
+        auto d = cb.data();
+        // in_pos_=6, in_len_=7 => wraps
+        BOOST_TEST(d[0].size() == 2);
+        BOOST_TEST(d[1].size() == 5);
+        BOOST_TEST(read_string(cb) == "G123456");
+
+        cb.consume(3);
+        BOOST_TEST(cb.size() == 4);
+        BOOST_TEST(read_string(cb) == "3456");
+    }
+
+    void
+    testCapacityOne()
+    {
+        char buf[1];
+        circular_dynamic_buffer cb{buf, 1};
+        BOOST_TEST(cb.max_size() == 1);
+
+        write_string(cb, "X", 1);
+        BOOST_TEST(cb.size() == 1);
+        BOOST_TEST(read_string(cb) == "X");
+
+        cb.consume(1);
+        BOOST_TEST(cb.size() == 0);
+
+        BOOST_TEST_THROWS(cb.prepare(2), std::length_error);
+    }
+
+    void
+    testPrepareZero()
+    {
+        char buf[16];
+        circular_dynamic_buffer cb{buf, 16};
+        auto mb = cb.prepare(0);
+        BOOST_TEST(bp_total_size(mb) == 0);
+        cb.commit(0);
+        BOOST_TEST(cb.size() == 0);
+    }
+
+    void
+    testMultipleCycles()
+    {
+        char buf[10];
+        circular_dynamic_buffer cb{buf, 10};
+
+        for(int cycle = 0; cycle < 20; ++cycle)
+        {
+            std::string msg = "C";
+            msg += std::to_string(cycle % 10);
+            auto len = msg.size();
+            BOOST_TEST(len <= 10);
+            write_string(cb, msg.c_str(), len);
+            BOOST_TEST(read_string(cb) == msg);
+            cb.consume(len);
+            BOOST_TEST(cb.size() == 0);
+        }
+    }
+
+    void
+    testFuzz()
+    {
+        constexpr std::size_t cap = 64;
+        char buf[cap];
+        circular_dynamic_buffer cb{buf, cap};
+
+        std::vector<unsigned char> model;
+
+        std::mt19937 rng{42};
+        std::uniform_int_distribution<int> action_dist{0, 2};
+        std::uniform_int_distribution<int> byte_dist{0, 255};
+
+        for(int iter = 0; iter < 2000; ++iter)
+        {
+            int action = action_dist(rng);
+
+            if(action == 0)
+            {
+                std::size_t avail = cap - model.size();
+                if(avail == 0)
+                    continue;
+                std::uniform_int_distribution<std::size_t> sz_dist{1, avail};
+                std::size_t n = sz_dist(rng);
+
+                std::vector<unsigned char> data(n);
+                for(auto& b : data)
+                    b = static_cast<unsigned char>(byte_dist(rng));
+
+                auto mb = cb.prepare(n);
+                std::size_t copied = 0;
+                if(mb[0].size() > 0)
+                {
+                    auto chunk = (std::min)(mb[0].size(), n);
+                    std::memcpy(mb[0].data(), data.data(), chunk);
+                    copied += chunk;
+                }
+                if(mb[1].size() > 0 && copied < n)
+                {
+                    auto chunk = (std::min)(mb[1].size(), n - copied);
+                    std::memcpy(mb[1].data(), data.data() + copied, chunk);
+                    copied += chunk;
+                }
+                cb.commit(n);
+                model.insert(model.end(), data.begin(), data.end());
+            }
+            else if(action == 1)
+            {
+                if(model.empty())
+                    continue;
+                std::uniform_int_distribution<std::size_t> sz_dist{1, model.size()};
+                std::size_t n = sz_dist(rng);
+                cb.consume(n);
+                model.erase(model.begin(), model.begin() + static_cast<std::ptrdiff_t>(n));
+            }
+            else
+            {
+                BOOST_TEST(cb.size() == model.size());
+                auto d = cb.data();
+                BOOST_TEST(bp_total_size(d) == model.size());
+
+                std::string actual = read_string(cb);
+                std::string expected(model.begin(), model.end());
+                BOOST_TEST(actual == expected);
+            }
+        }
+
+        BOOST_TEST(cb.size() == model.size());
+        std::string actual = read_string(cb);
+        std::string expected(model.begin(), model.end());
+        BOOST_TEST(actual == expected);
+    }
+
+    void
+    testCommitPartialThenPrepare()
+    {
+        char buf[16];
+        circular_dynamic_buffer cb{buf, 16};
+
+        cb.prepare(10);
+        cb.commit(4);
+        BOOST_TEST(cb.size() == 4);
+
+        auto mb = cb.prepare(12);
+        BOOST_TEST(bp_total_size(mb) == 12);
+    }
+
     void
     testGrind()
     {
@@ -116,6 +462,23 @@ struct circular_dynamic_buffer_test
     {
         testMembers();
         testGrind();
+
+        testDataWrapped();
+        testPrepareTooLargeWithExistingData();
+        testPrepareWrapped();
+        testPrepareZero();
+        testCommitMoreThanPrepared();
+        testCommitZero();
+        testCommitClearsOutSize();
+        testCommitPartialThenPrepare();
+        testConsumeMoreThanSize();
+        testConsumeZero();
+        testConsumeAllWithPreparedBuffer();
+        testConsumeAllNoPrepareResetsPos();
+        testWrapAroundRoundTrip();
+        testCapacityOne();
+        testMultipleCycles();
+        testFuzz();
     }
 };
 
diff --git a/test/unit/buffers/consuming_buffers.cpp b/test/unit/buffers/consuming_buffers.cpp
deleted file mode 100644
index b04b5b6ed..000000000
--- a/test/unit/buffers/consuming_buffers.cpp
+++ /dev/null
@@ -1,136 +0,0 @@
-//
-// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-// Test that header file is self-contained.
-#include <boost/capy/buffers/consuming_buffers.hpp>
-
-#include <boost/capy/buffers.hpp>
-
-#include <array>
-#include <concepts>
-#include <iterator>
-#include <ranges>
-
-#include "test_suite.hpp"
-
-namespace boost {
-namespace capy {
-
-//------------------------------------------------
-// consuming_buffers tests
-// Focus: verify consuming_buffers models buffer sequence concept
-//------------------------------------------------
-
-struct consuming_buffers_test
-{
-    void
-    testBufferSequenceConcept()
-    {
-        char buf1[100];
-        char buf2[200];
-        std::array<mutable_buffer, 2> bufs = {
-            mutable_buffer(buf1, sizeof(buf1)),
-            mutable_buffer(buf2, sizeof(buf2))
-        };
-
-        consuming_buffers<decltype(bufs)> cb(bufs);
-
-        // Verify consuming_buffers models mutable_buffer_sequence
-        static_assert(
-            MutableBufferSequence<consuming_buffers<decltype(bufs)>>,
-            "consuming_buffers must model mutable_buffer_sequence");
-
-        // Verify it can be used with buffer_size
-        std::size_t const size = buffer_size(cb);
-        BOOST_TEST_EQ(size, sizeof(buf1) + sizeof(buf2));
-    }
-
-    void
-    testSingleBuffer()
-    {
-        char buf[100];
-        mutable_buffer mbuf(buf, sizeof(buf));
-
-        consuming_buffers<mutable_buffer> cb(mbuf);
-
-        // Verify consuming_buffers models mutable_buffer_sequence for single buffer
-        static_assert(
-            MutableBufferSequence<consuming_buffers<mutable_buffer>>,
-            "consuming_buffers must model mutable_buffer_sequence for single buffer");
-
-        std::size_t const size = buffer_size(cb);
-        BOOST_TEST_EQ(size, sizeof(buf));
-    }
-
-    void
-    testRangeConcepts()
-    {
-        char buf1[100];
-        char buf2[200];
-        std::array<mutable_buffer, 2> bufs = {
-            mutable_buffer(buf1, sizeof(buf1)),
-            mutable_buffer(buf2, sizeof(buf2))
-        };
-
-        using cb_type = consuming_buffers<decltype(bufs)>;
-
-        // Most general to most specific - Range Concepts
-        static_assert(std::ranges::range<cb_type>,
-            "consuming_buffers must satisfy std::ranges::range");
-        static_assert(std::ranges::input_range<cb_type>,
-            "consuming_buffers must satisfy std::ranges::input_range");
-        static_assert(std::ranges::forward_range<cb_type>,
-            "consuming_buffers must satisfy std::ranges::forward_range");
-        static_assert(std::ranges::bidirectional_range<cb_type>,
-            "consuming_buffers must satisfy std::ranges::bidirectional_range");
-
-        // Most general to most specific - Iterator Concepts
-        using iter_t = std::ranges::iterator_t<cb_type>;
-        static_assert(std::input_iterator<iter_t>,
-            "consuming_buffers iterator must satisfy std::input_iterator");
-        static_assert(std::forward_iterator<iter_t>,
-            "consuming_buffers iterator must satisfy std::forward_iterator");
-        static_assert(std::bidirectional_iterator<iter_t>,
-            "consuming_buffers iterator must satisfy std::bidirectional_iterator");
-
-        // Iterator traits check
-        using traits = std::iterator_traits<iter_t>;
-        static_assert(std::same_as<typename traits::iterator_category, std::bidirectional_iterator_tag>,
-            "Iterator category must be bidirectional_iterator_tag");
-
-        // Range value type check
-        static_assert(std::is_convertible_v<std::ranges::range_value_t<cb_type>, mutable_buffer>,
-            "Range value type must be convertible to mutable_buffer");
-
-        // Verify std::ranges::begin and std::ranges::end work
-        {
-            cb_type cb(bufs);
-            auto it1 = std::ranges::begin(cb);
-            auto it2 = std::ranges::end(cb);
-            BOOST_TEST(it1 != it2);
-        }
-
-        // Final check - Buffer Sequence Concept
-        static_assert(MutableBufferSequence<cb_type>,
-            "consuming_buffers must model mutable_buffer_sequence");
-    }
-
-    void
-    run()
-    {
-        testBufferSequenceConcept();
-        testSingleBuffer();
-        testRangeConcepts();
-    }
-};
-
-TEST_SUITE(consuming_buffers_test, "boost.capy.consuming_buffers");
-
-} // namespace capy
-} // namespace boost
diff --git a/test/unit/buffers/slice.cpp b/test/unit/buffers/slice.cpp
deleted file mode 100644
index 4a3bb28e1..000000000
--- a/test/unit/buffers/slice.cpp
+++ /dev/null
@@ -1,320 +0,0 @@
-//
-// Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
-//
-// Distributed under the Boost Software License, Version 1.0. (See accompanying
-// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
-//
-// Official repository: https://github.com/cppalliance/capy
-//
-
-// Test that header file is self-contained.
-#include <boost/capy/buffers/slice.hpp>
-
-#include <boost/capy/buffers/buffer_pair.hpp>
-#include <boost/capy/buffers/buffer_copy.hpp>
-#include <boost/capy/buffers/make_buffer.hpp>
-
-#include <array>
-#include <string_view>
-#include <vector>
-
-#include "test_buffers.hpp"
-#include "test_suite.hpp"
-
-namespace boost {
-namespace capy {
-
-template<
-    std::size_t I,
-    std::size_t N>
-void
-set(
-    std::string&,
-    std::array<const_buffer, N>&)
-{
-}
-
-template<
-    std::size_t I,
-    std::size_t N,
-    class... Args>
-void
-set(
-    std::string& s,
-    std::array<const_buffer, N>& v,
-    char const* p,
-    Args const&... args)
-{
-    std::string_view sv(p);
-    v[I] = make_buffer(sv);
-    s.append(sv.data(), sv.size());
-    set<I+1>(s, v, args...);
-}
-
-auto
-make_buffers(
-    std::string&) ->
-    std::array<const_buffer, 0>
-{
-    return {};
-}
-
-template<
-    class... Args>
-auto
-make_buffers(
-    std::string& s,
-    char const* arg0,
-    Args const&... args) ->
-    std::array<const_buffer, 1 + sizeof...(Args)>
-{
-    s = {};
-    std::array<const_buffer, 1 + sizeof...(Args)> v;
-    set<0>(s, v, arg0, args...);
-    return v;
-}
-
-struct slice_test
-{
-    static
-    void
-    checkStatic()
-    {
-        using T = slice_of<const_buffer_pair>;
-
-        static_assert(std::is_default_constructible<T>::value);
-        static_assert(std::is_copy_constructible<T>::value);
-        static_assert(std::is_move_constructible<T>::value);
-        static_assert(std::is_copy_assignable<T>::value);
-        static_assert(std::is_move_assignable<T>::value);
-
-        using U = T::const_iterator;
-
-        static_assert(std::is_default_constructible<U>::value);
-        static_assert(std::is_copy_constructible<U>::value);
-        static_assert(std::is_move_constructible<U>::value);
-        static_assert(std::is_copy_assignable<U>::value);
-        static_assert(std::is_move_assignable<U>::value);
-    }
-
-    template<class B>
-    static
-    void
-    check(
-        B const& b,
-        std::string_view s)
-    {
-        auto constexpr M = 1024;
-        char buf[M];
-        if(! BOOST_TEST_LE(buffer_size(b), M))
-            return;
-        if(! BOOST_TEST_EQ(buffer_size(b), s.size()))
-            return;
-        auto const n = buffer_copy(
-            mutable_buffer(buf, M), b);
-        if(! BOOST_TEST_EQ(n, s.size()))
-            return;
-        if(! BOOST_TEST_EQ(std::string_view(buf, n), s))
-            return;
-
-        std::string tmp;
-        test::check_iterators(b, s, tmp);
-    }
-
-    // Use a vector so that iterator invalidation is observable during testing.
-    using seq_type = std::vector<const_buffer>;
-
-    void
-    grind_back(
-        slice_of<seq_type> const& bs0,
-        std::string_view pat0)
-    {
-        auto const n = buffer_size(bs0);
-        if(! BOOST_TEST_EQ(n, pat0.size()))
-            return;
-        for(std::size_t i = 0; i < n; ++i)
-        {
-            auto bs = bs0;
-            auto pat = pat0.substr(0, pat0.size() - i);
-            remove_suffix(bs, i);
-            check(bs, pat);
-        }
-        // n >= buffer_size
-        for(std::size_t i = 0; i < 2; ++i)
-        {
-            auto bs = bs0;
-            remove_suffix(bs, n + i);
-            BOOST_TEST_EQ(buffer_size(bs), 0);
-            check(bs, "");
-        }
-    }
-
-    void
-    grind(
-        slice_of<seq_type> const& bs0,
-        std::string_view pat0)
-    {
-        auto const n = buffer_size(bs0);
-        if(! BOOST_TEST_EQ(n, pat0.size()))
-            return;
-        for(std::size_t i = 0; i < n; ++i)
-        {
-            auto bs = bs0;
-            auto pat = pat0.substr(i);
-            remove_prefix(bs, i);
-            check(bs, pat);
-            grind_back(bs, pat);
-        }
-        // n >= buffer_size
-        for(std::size_t i = 0; i < 2; ++i)
-        {
-            auto bs = bs0;
-            remove_prefix(bs, n + i);
-            BOOST_TEST_EQ(buffer_size(bs), 0);
-            check(bs, "");
-        }
-    }
-
-    void
-    testSansPrefixSingleBuffer()
-    {
-        // Test sans_prefix with a single mutable_buffer
-        {
-            char data[] = "0123456789";
-            mutable_buffer buf(data, 10);
-
-            // sans_prefix(buf, 0) should return the full buffer
-            auto s0 = sans_prefix(buf, 0);
-            BOOST_TEST_EQ(buffer_size(s0), 10u);
-
-            // sans_prefix(buf, 3) should skip first 3 bytes
-            auto s3 = sans_prefix(buf, 3);
-            BOOST_TEST_EQ(buffer_size(s3), 7u);
-            BOOST_TEST_EQ(
-                static_cast<char const*>(
-                    const_buffer(s3).data())[0], '3');
-
-            // sans_prefix(buf, 10) should be empty
-            auto s10 = sans_prefix(buf, 10);
-            BOOST_TEST_EQ(buffer_size(s10), 0u);
-
-            // sans_prefix(buf, 100) should be empty
-            auto s100 = sans_prefix(buf, 100);
-            BOOST_TEST_EQ(buffer_size(s100), 0u);
-        }
-
-        // Test sans_prefix with a single const_buffer
-        {
-            char data[] = "Hello World";
-            const_buffer buf(data, 11);
-
-            auto s0 = sans_prefix(buf, 0);
-            BOOST_TEST_EQ(buffer_size(s0), 11u);
-
-            auto s6 = sans_prefix(buf, 6);
-            BOOST_TEST_EQ(buffer_size(s6), 5u);
-            BOOST_TEST_EQ(
-                static_cast<char const*>(s6.data())[0], 'W');
-        }
-    }
-
-    void
-    testSansPrefixBufferSequence()
-    {
-        // Test sans_prefix with a vector of buffers
-        std::string s1 = "ABCD";
-        std::string s2 = "EFGH";
-        std::string s3 = "IJKL";
-
-        std::vector<const_buffer> bufs = {
-            const_buffer(s1.data(), s1.size()),
-            const_buffer(s2.data(), s2.size()),
-            const_buffer(s3.data(), s3.size())
-        };
-
-        // sans_prefix removing nothing
-        {
-            auto result = sans_prefix(bufs, 0);
-            BOOST_TEST_EQ(buffer_size(result), 12u);
-        }
-
-        // sans_prefix removing 2 bytes (within first buffer)
-        {
-            auto result = sans_prefix(bufs, 2);
-            BOOST_TEST_EQ(buffer_size(result), 10u);
-        }
-
-        // sans_prefix removing 5 bytes (crosses buffer boundary)
-        {
-            auto result = sans_prefix(bufs, 5);
-            BOOST_TEST_EQ(buffer_size(result), 7u);
-        }
-
-        // sans_prefix removing all
-        {
-            auto result = sans_prefix(bufs, 12);
-            BOOST_TEST_EQ(buffer_size(result), 0u);
-        }
-    }
-
-    void
-    testBufferEmptyWithSlice()
-    {
-        // Verify buffer_empty works correctly with sliced buffers
-        {
-            char data[] = "test";
-            mutable_buffer buf(data, 4);
-
-            auto s0 = sans_prefix(buf, 0);
-            BOOST_TEST(!buffer_empty(s0));
-
-            auto s4 = sans_prefix(buf, 4);
-            BOOST_TEST(buffer_empty(s4));
-        }
-    }
-
-    void
-    testSansPrefixLoop()
-    {
-        // Test the pattern used in any_buffer_source::read()
-        char data[10] = {};
-        mutable_buffer buf(data, 10);
-
-        auto dest = sans_prefix(buf, 0);
-        BOOST_TEST_EQ(buffer_size(dest), 10u);
-        BOOST_TEST(!buffer_empty(dest));
-
-        // Simulate consuming 2 bytes
-        dest = sans_prefix(dest, 2);
-        BOOST_TEST_EQ(buffer_size(dest), 8u);
-        BOOST_TEST(!buffer_empty(dest));
-
-        // Consume remaining
-        dest = sans_prefix(dest, 8);
-        BOOST_TEST_EQ(buffer_size(dest), 0u);
-        BOOST_TEST(buffer_empty(dest));
-    }
-
-    void
-    run()
-    {
-        std::string s;
-        auto a = make_buffers(s, "boost.", "buffers.", "slice_");
-        seq_type bs(a.begin(), a.end());
-        test::check_sequence(bs, s, true);
-        //check(bs, s);
-        //grind(bs, s);
-
-        testSansPrefixSingleBuffer();
-        testSansPrefixBufferSequence();
-        testBufferEmptyWithSlice();
-        testSansPrefixLoop();
-    }
-};
-
-TEST_SUITE(
-    slice_test,
-    "boost.capy.buffers.slice");
-
-} // capy
-} // boost
diff --git a/test/unit/buffers/test_buffers.hpp b/test/unit/buffers/test_buffers.hpp
index 33261bca1..ff1f25b03 100644
--- a/test/unit/buffers/test_buffers.hpp
+++ b/test/unit/buffers/test_buffers.hpp
@@ -11,8 +11,8 @@
 #define BOOST_CAPY_BUFFERS_TEST_BUFFERS_HPP
 
 #include <boost/capy/buffers/buffer_copy.hpp>
+#include <boost/capy/buffers/buffer_slice.hpp>
 #include <boost/capy/buffers/make_buffer.hpp>
-#include <boost/capy/buffers/slice.hpp>
 #include <string>
 #include <string_view>
 
@@ -261,40 +261,39 @@ grind_front(
     bool deep)
 {
     std::string tmp;
+    std::size_t const total = buffer_size(bs0);
 
     for(std::size_t n = 0; n <= pat0.size() + 1; ++n)
     {
         {
+            // remove_prefix: drop the first n bytes
             auto pat = trimmed_front(pat0, n);
-            slice_type<ConstBufferSequence> bs(bs0);
-            remove_prefix(bs, n);
-            check_eq(bs, pat);
-            check_iterators(bs, pat, tmp);
+            auto bs = buffer_slice(bs0);
+            bs.remove_prefix(n);
+            check_eq(bs.data(), pat);
+            check_iterators(bs.data(), pat, tmp);
 
             if(deep)
             {
-                // Take a copy, blank out the original to invalidate any
-                // iterators, and redo the test
-                slice_type<ConstBufferSequence> bsc(bs);
-                {
-                    slice_type<ConstBufferSequence> dummy{};
-                    std::swap(bs, dummy);
-                }
+                // Take a copy, blank out the original, and redo the test
+                auto bsc = bs;
+                bs = decltype(bs){};
                 for(std::size_t m = 0; m <= pat.size() + 1; ++m)
                 {
                     auto pat2 = trimmed_front(pat, m);
-                    slice_type<ConstBufferSequence> bs2(bsc);
-                    remove_prefix(bs2, m);
-                    check_eq(bs2, pat2);
+                    auto bs2 = bsc;
+                    bs2.remove_prefix(m);
+                    check_eq(bs2.data(), pat2);
                 }
             }
         }
         {
+            // keep_prefix: keep only the first n bytes
             auto pat = kept_front(pat0, n);
-            slice_type<ConstBufferSequence> bs(bs0);
-            keep_prefix(bs, n);
-            check_eq(bs, pat);
-            check_iterators(bs, pat, tmp);
+            std::size_t const len = (n < total) ? n : total;
+            auto bs = buffer_slice(bs0, 0, len);
+            check_eq(bs.data(), pat);
+            check_iterators(bs.data(), pat, tmp);
         }
     }
 }
@@ -307,39 +306,47 @@ grind_back(
     bool deep)
 {
     std::string tmp;
+    std::size_t const total = buffer_size(bs0);
 
     for(std::size_t n = 0; n <= pat0.size() + 1; ++n)
     {
         {
+            // remove_suffix: drop the last n bytes
             auto pat = trimmed_back(pat0, n);
-            slice_type<ConstBufferSequence> bs(bs0);
-            remove_suffix(bs, n);
-            check_eq(bs, pat);
-            check_iterators(bs, pat, tmp);
+            std::size_t const len = (n < total) ? total - n : 0;
+            auto bs = buffer_slice(bs0, 0, len);
+            check_eq(bs.data(), pat);
+            check_iterators(bs.data(), pat, tmp);
             if(deep)
             {
-                // Take a copy, blank out the original to invalidate any
-                // iterators, and redo the test
-                slice_type<ConstBufferSequence> bsc(bs);
-                {
-                    slice_type<ConstBufferSequence> dummy{};
-                    std::swap(bs, dummy);
-                }
+                // Take a copy, blank out the original, and redo the test
+                auto bsc = bs;
+                bs = decltype(bs){};
                 for(std::size_t m = 0; m <= pat.size() + 1; ++m)
                 {
                     auto pat2 = trimmed_back(pat, m);
-                    slice_type<ConstBufferSequence> bs2(bsc);
-                    remove_suffix(bs2, m);
-                    check_eq(bs2, pat2);
+                    // Drop another m bytes from the back of bsc by
+                    // length-capping a fresh slice of the same data.
+                    std::size_t const len2 = buffer_size(bsc.data());
+                    std::size_t const new_len =
+                        (m < len2) ? len2 - m : 0;
+                    auto bs2 = bsc;
+                    // Walk forward (current state) and use remove_prefix
+                    // to drop the front; for the back we need a fresh
+                    // slice over the inner-window. Easiest: construct
+                    // a new slice from the original at the right offset/length.
+                    bs2 = buffer_slice(bs0, 0, new_len);
+                    check_eq(bs2.data(), pat2);
                 }
             }
         }
         {
+            // keep_suffix: keep only the last n bytes
             auto pat = kept_back(pat0, n);
-            slice_type<ConstBufferSequence> bs(bs0);
-            keep_suffix(bs, n);
-            check_eq(bs, pat);
-            check_iterators(bs, pat, tmp);
+            std::size_t const offset = (n < total) ? total - n : 0;
+            auto bs = buffer_slice(bs0, offset);
+            check_eq(bs.data(), pat);
+            check_iterators(bs.data(), pat, tmp);
         }
     }
 }
diff --git a/test/unit/delay.cpp b/test/unit/delay.cpp
new file mode 100644
index 000000000..cc53d6cd4
--- /dev/null
+++ b/test/unit/delay.cpp
@@ -0,0 +1,288 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// Test that header file is self-contained.
+#include <boost/capy/delay.hpp>
+
+#include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/task.hpp>
+
+#include "test_helpers.hpp"
+#include "test_suite.hpp"
+
+#include <latch>
+#include <memory_resource>
+#include <stop_token>
+
+namespace boost {
+namespace capy {
+
+using namespace std::chrono_literals;
+
+struct delay_test
+{
+    // Test: delay completes after duration
+    void
+    testDelayCompletes()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        bool completed = false;
+
+        auto delay_task = [&]() -> task<void>
+        {
+            (void) co_await delay(10ms);
+            completed = true;
+        };
+
+        run_async(pool.get_executor(),
+            [&]() {
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(delay_task());
+
+        done.wait();
+        BOOST_TEST(completed);
+    }
+
+    // Test: delay waits at least the specified duration
+    void
+    testDelayMinimumDuration()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+
+        auto delay_task = [&]() -> task<void>
+        {
+            (void) co_await delay(50ms);
+        };
+
+        auto start = std::chrono::steady_clock::now();
+
+        run_async(pool.get_executor(),
+            [&]() {
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(delay_task());
+
+        done.wait();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        BOOST_TEST(elapsed >= 50ms);
+    }
+
+    // Test: stop requested before delay suspends (early-out path)
+    void
+    testDelayCancellationEarlyOut()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        std::stop_source source;
+
+        auto delay_task = [&]() -> task<void>
+        {
+            (void) co_await delay(10s);
+        };
+
+        auto start = std::chrono::steady_clock::now();
+
+        run_async(pool.get_executor(), source.get_token(),
+            [&]() {
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(delay_task());
+
+        // Cancel immediately — likely before delay suspends
+        source.request_stop();
+
+        done.wait();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        BOOST_TEST(elapsed < 1s);
+    }
+
+    // Test: stop requested after delay is fully suspended
+    //       (exercises cancel_fn stop callback path)
+    void
+    testDelayCancellationWhileSuspended()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        std::latch suspended(1);
+        std::stop_source source;
+
+        auto delay_task = [&]() -> task<void>
+        {
+            // Signal that we're about to suspend on delay
+            suspended.count_down();
+            (void) co_await delay(10s);
+        };
+
+        auto start = std::chrono::steady_clock::now();
+
+        run_async(pool.get_executor(), source.get_token(),
+            [&]() {
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(delay_task());
+
+        // Wait for the task to reach the delay point
+        suspended.wait();
+        // Small sleep to ensure delay_awaitable::await_suspend
+        // has fully completed (stop callback registered)
+        std::this_thread::sleep_for(10ms);
+        source.request_stop();
+
+        done.wait();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        BOOST_TEST(elapsed < 1s);
+    }
+
+    // Test: zero-duration delay completes immediately
+    void
+    testZeroDuration()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        bool completed = false;
+
+        auto delay_task = [&]() -> task<void>
+        {
+            (void) co_await delay(0ms);
+            completed = true;
+        };
+
+        run_async(pool.get_executor(),
+            [&]() {
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(delay_task());
+
+        done.wait();
+        BOOST_TEST(completed);
+    }
+
+    // Test: multiple sequential delays
+    void
+    testSequentialDelays()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        int step = 0;
+
+        auto delay_task = [&]() -> task<void>
+        {
+            (void) co_await delay(5ms);
+            step = 1;
+            (void) co_await delay(5ms);
+            step = 2;
+            (void) co_await delay(5ms);
+            step = 3;
+        };
+
+        run_async(pool.get_executor(),
+            [&]() {
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(delay_task());
+
+        done.wait();
+        BOOST_TEST_EQ(step, 3);
+    }
+
+    // Test: destroying delay_awaitable while suspended
+    //       cleans up both stop callback and timer
+    void
+    testDestroyWhileSuspended()
+    {
+// GCC emits a false -Wmaybe-uninitialized when it inlines
+// the stop_callback destructor through the alignas buffer.
+#if defined(__GNUC__) && !defined(__clang__)
+# pragma GCC diagnostic push
+# pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+        thread_pool pool(1);
+        auto ex = pool.get_executor();
+        std::stop_source source;
+        io_env env{ex, source.get_token(),
+            std::pmr::get_default_resource()};
+
+        {
+            delay_awaitable da(std::chrono::seconds(10));
+            // Manually suspend — registers timer and stop callback
+            da.await_suspend(std::noop_coroutine(), &env);
+            // da destroyed here without calling await_resume
+        }
+
+        // If cleanup was incomplete, requesting stop or waiting
+        // for the timer would access freed memory (UB/crash).
+        source.request_stop();
+        std::this_thread::sleep_for(20ms);
+        BOOST_TEST(true);
+#if defined(__GNUC__) && !defined(__clang__)
+# pragma GCC diagnostic pop
+#endif
+    }
+
+    // Test: concurrent delays on a multi-threaded pool
+    //       exercises use_service race and shared timer_service
+    void
+    testConcurrentDelays()
+    {
+        constexpr int N = 10;
+        thread_pool pool(4);
+        std::latch done(N);
+
+        auto delay_task = [](int i) -> task<void>
+        {
+            (void) co_await delay(10ms * i);
+        };
+
+        for(int i = 0; i < N; ++i)
+        {
+            run_async(pool.get_executor(),
+                [&]() { done.count_down(); },
+                [&](std::exception_ptr) {
+                    done.count_down();
+                })(delay_task(i));
+        }
+
+        done.wait();
+        BOOST_TEST(true);
+    }
+
+    void
+    run()
+    {
+        testDelayCompletes();
+        testDelayMinimumDuration();
+        testDelayCancellationEarlyOut();
+        testDelayCancellationWhileSuspended();
+        testZeroDuration();
+        testSequentialDelays();
+        testDestroyWhileSuspended();
+        testConcurrentDelays();
+    }
+};
+
+TEST_SUITE(delay_test, "capy.delay");
+
+} // capy
+} // boost
diff --git a/test/unit/buffers/buffer_array.cpp b/test/unit/detail/buffer_array.cpp
similarity index 78%
rename from test/unit/buffers/buffer_array.cpp
rename to test/unit/detail/buffer_array.cpp
index 33407de43..1b8091942 100644
--- a/test/unit/buffers/buffer_array.cpp
+++ b/test/unit/detail/buffer_array.cpp
@@ -8,16 +8,41 @@
 //
 
 // Test that header file is self-contained.
-#include <boost/capy/buffers/buffer_array.hpp>
+#include <boost/capy/detail/buffer_array.hpp>
 
-#include "test_buffers.hpp"
+#include "../buffers/test_buffers.hpp"
 
+#include <ranges>
 #include <span>
 #include <vector>
 
 namespace boost {
 namespace capy {
 
+// std::ranges concepts for detail::const_buffer_array
+
+static_assert(std::ranges::range<detail::const_buffer_array<4>>);
+static_assert(std::ranges::input_range<detail::const_buffer_array<4>>);
+static_assert(std::ranges::forward_range<detail::const_buffer_array<4>>);
+static_assert(std::ranges::bidirectional_range<detail::const_buffer_array<4>>);
+static_assert(std::ranges::random_access_range<detail::const_buffer_array<4>>);
+static_assert(std::ranges::contiguous_range<detail::const_buffer_array<4>>);
+
+static_assert(ConstBufferSequence<detail::const_buffer_array<4>>);
+static_assert(!MutableBufferSequence<detail::const_buffer_array<4>>);
+
+// std::ranges concepts for detail::mutable_buffer_array
+
+static_assert(std::ranges::range<detail::mutable_buffer_array<4>>);
+static_assert(std::ranges::input_range<detail::mutable_buffer_array<4>>);
+static_assert(std::ranges::forward_range<detail::mutable_buffer_array<4>>);
+static_assert(std::ranges::bidirectional_range<detail::mutable_buffer_array<4>>);
+static_assert(std::ranges::random_access_range<detail::mutable_buffer_array<4>>);
+static_assert(std::ranges::contiguous_range<detail::mutable_buffer_array<4>>);
+
+static_assert(ConstBufferSequence<detail::mutable_buffer_array<4>>);
+static_assert(MutableBufferSequence<detail::mutable_buffer_array<4>>);
+
 struct buffer_array_test
 {
     void
@@ -27,7 +52,7 @@ struct buffer_array_test
 
         // default constructor
         {
-            const_buffer_array<4> ba;
+            detail::const_buffer_array<4> ba;
             BOOST_TEST_EQ(ba.to_span().size(), 0);
             BOOST_TEST_EQ(buffer_size(ba), 0);
         }
@@ -35,7 +60,7 @@ struct buffer_array_test
         // single buffer constructor
         {
             const_buffer b(pat.data(), pat.size());
-            const_buffer_array<4> ba(b);
+            detail::const_buffer_array<4> ba(b);
             BOOST_TEST_EQ(ba.to_span().size(), 1);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -44,7 +69,7 @@ struct buffer_array_test
         // empty buffer is skipped
         {
             const_buffer b(pat.data(), 0);
-            const_buffer_array<4> ba(b);
+            detail::const_buffer_array<4> ba(b);
             BOOST_TEST_EQ(ba.to_span().size(), 0);
             BOOST_TEST_EQ(buffer_size(ba), 0);
         }
@@ -55,7 +80,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            const_buffer_array<4> ba(v);
+            detail::const_buffer_array<4> ba(v);
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -64,8 +89,8 @@ struct buffer_array_test
         // copy constructor
         {
             const_buffer b(pat.data(), pat.size());
-            const_buffer_array<4> ba1(b);
-            const_buffer_array<4> ba2(ba1);
+            detail::const_buffer_array<4> ba1(b);
+            detail::const_buffer_array<4> ba2(ba1);
             BOOST_TEST_EQ(ba2.to_span().size(), 1);
             BOOST_TEST_EQ(buffer_size(ba2), pat.size());
             BOOST_TEST_EQ(test::make_string(ba2), pat);
@@ -74,8 +99,8 @@ struct buffer_array_test
         // copy assignment
         {
             const_buffer b(pat.data(), pat.size());
-            const_buffer_array<4> ba1(b);
-            const_buffer_array<4> ba2;
+            detail::const_buffer_array<4> ba1(b);
+            detail::const_buffer_array<4> ba2;
             ba2 = ba1;
             BOOST_TEST_EQ(ba2.to_span().size(), 1);
             BOOST_TEST_EQ(buffer_size(ba2), pat.size());
@@ -88,7 +113,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            const_buffer_array<2> ba;
+            detail::const_buffer_array<2> ba;
             ba = v;
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
@@ -97,7 +122,7 @@ struct buffer_array_test
         // span conversion
         {
             const_buffer b(pat.data(), pat.size());
-            const_buffer_array<4> ba(b);
+            detail::const_buffer_array<4> ba(b);
             std::span<const_buffer const> sp = ba;
             BOOST_TEST_EQ(sp.size(), 1);
             BOOST_TEST_EQ(sp[0].data(), pat.data());
@@ -106,7 +131,7 @@ struct buffer_array_test
         // to_span
         {
             const_buffer b(pat.data(), pat.size());
-            const_buffer_array<4> ba(b);
+            detail::const_buffer_array<4> ba(b);
             auto sp = ba.to_span();
             BOOST_TEST_EQ(sp.size(), 1);
             BOOST_TEST_EQ(sp[0].data(), pat.data());
@@ -118,7 +143,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            const_buffer_array<2> ba(v);
+            detail::const_buffer_array<2> ba(v);
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
         }
@@ -132,7 +157,7 @@ struct buffer_array_test
             bool threw = false;
             try
             {
-                const_buffer_array<2> ba(std::in_place, v);
+                detail::const_buffer_array<2> ba(std::in_place, v);
                 (void)ba;
             }
             catch(std::length_error const&)
@@ -148,7 +173,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            const_buffer_array<4> ba(std::in_place, v);
+            detail::const_buffer_array<4> ba(std::in_place, v);
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -160,7 +185,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            const_buffer_array<4> ba(v.begin(), v.end());
+            detail::const_buffer_array<4> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -172,7 +197,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            const_buffer_array<2> ba(v.begin(), v.end());
+            detail::const_buffer_array<2> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
         }
@@ -180,7 +205,7 @@ struct buffer_array_test
         // iterator-pair empty range
         {
             std::vector<const_buffer> v;
-            const_buffer_array<4> ba(v.begin(), v.end());
+            detail::const_buffer_array<4> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 0);
             BOOST_TEST_EQ(buffer_size(ba), 0);
         }
@@ -192,7 +217,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 0);
             v.emplace_back(pat.data() + 3, 5);
-            const_buffer_array<4> ba(v.begin(), v.end());
+            detail::const_buffer_array<4> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
         }
@@ -206,7 +231,7 @@ struct buffer_array_test
             bool threw = false;
             try
             {
-                const_buffer_array<2> ba(
+                detail::const_buffer_array<2> ba(
                     std::in_place, v.begin(), v.end());
                 (void)ba;
             }
@@ -223,7 +248,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            const_buffer_array<4> ba(
+            detail::const_buffer_array<4> ba(
                 std::in_place, v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
@@ -237,7 +262,7 @@ struct buffer_array_test
                 std::vector<const_buffer> v;
                 v.emplace_back(pat.data(), i);
                 v.emplace_back(pat.data() + i, pat.size() - i);
-                const_buffer_array<4> ba(v);
+                detail::const_buffer_array<4> ba(v);
                 test::check_sequence(ba, pat);
             }
         }
@@ -250,7 +275,7 @@ struct buffer_array_test
 
         // default constructor
         {
-            mutable_buffer_array<4> ba;
+            detail::mutable_buffer_array<4> ba;
             BOOST_TEST_EQ(ba.to_span().size(), 0);
             BOOST_TEST_EQ(buffer_size(ba), 0);
         }
@@ -258,7 +283,7 @@ struct buffer_array_test
         // single buffer constructor
         {
             mutable_buffer b(pat.data(), pat.size());
-            mutable_buffer_array<4> ba(b);
+            detail::mutable_buffer_array<4> ba(b);
             BOOST_TEST_EQ(ba.to_span().size(), 1);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -270,7 +295,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            mutable_buffer_array<4> ba(v);
+            detail::mutable_buffer_array<4> ba(v);
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -279,8 +304,8 @@ struct buffer_array_test
         // copy constructor
         {
             mutable_buffer b(pat.data(), pat.size());
-            mutable_buffer_array<4> ba1(b);
-            mutable_buffer_array<4> ba2(ba1);
+            detail::mutable_buffer_array<4> ba1(b);
+            detail::mutable_buffer_array<4> ba2(ba1);
             BOOST_TEST_EQ(ba2.to_span().size(), 1);
             BOOST_TEST_EQ(buffer_size(ba2), pat.size());
             BOOST_TEST_EQ(test::make_string(ba2), pat);
@@ -289,8 +314,8 @@ struct buffer_array_test
         // copy assignment
         {
             mutable_buffer b(pat.data(), pat.size());
-            mutable_buffer_array<4> ba1(b);
-            mutable_buffer_array<4> ba2;
+            detail::mutable_buffer_array<4> ba1(b);
+            detail::mutable_buffer_array<4> ba2;
             ba2 = ba1;
             BOOST_TEST_EQ(ba2.to_span().size(), 1);
             BOOST_TEST_EQ(buffer_size(ba2), pat.size());
@@ -303,7 +328,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            mutable_buffer_array<2> ba;
+            detail::mutable_buffer_array<2> ba;
             ba = v;
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
@@ -312,7 +337,7 @@ struct buffer_array_test
         // span conversion
         {
             mutable_buffer b(pat.data(), pat.size());
-            mutable_buffer_array<4> ba(b);
+            detail::mutable_buffer_array<4> ba(b);
             std::span<mutable_buffer> sp = ba;
             BOOST_TEST_EQ(sp.size(), 1);
             BOOST_TEST_EQ(sp[0].data(), pat.data());
@@ -321,7 +346,7 @@ struct buffer_array_test
         // to_span
         {
             mutable_buffer b(pat.data(), pat.size());
-            mutable_buffer_array<4> ba(b);
+            detail::mutable_buffer_array<4> ba(b);
             auto sp = ba.to_span();
             BOOST_TEST_EQ(sp.size(), 1);
             BOOST_TEST_EQ(sp[0].data(), pat.data());
@@ -333,7 +358,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            mutable_buffer_array<2> ba(v);
+            detail::mutable_buffer_array<2> ba(v);
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
         }
@@ -347,7 +372,7 @@ struct buffer_array_test
             bool threw = false;
             try
             {
-                mutable_buffer_array<2> ba(std::in_place, v);
+                detail::mutable_buffer_array<2> ba(std::in_place, v);
                 (void)ba;
             }
             catch(std::length_error const&)
@@ -363,7 +388,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            mutable_buffer_array<4> ba(std::in_place, v);
+            detail::mutable_buffer_array<4> ba(std::in_place, v);
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -375,7 +400,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            mutable_buffer_array<4> ba(v.begin(), v.end());
+            detail::mutable_buffer_array<4> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
             BOOST_TEST_EQ(test::make_string(ba), pat);
@@ -387,7 +412,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            mutable_buffer_array<2> ba(v.begin(), v.end());
+            detail::mutable_buffer_array<2> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
         }
@@ -395,7 +420,7 @@ struct buffer_array_test
         // iterator-pair empty range
         {
             std::vector<mutable_buffer> v;
-            mutable_buffer_array<4> ba(v.begin(), v.end());
+            detail::mutable_buffer_array<4> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 0);
             BOOST_TEST_EQ(buffer_size(ba), 0);
         }
@@ -407,7 +432,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 0);
             v.emplace_back(pat.data() + 3, 5);
-            mutable_buffer_array<4> ba(v.begin(), v.end());
+            detail::mutable_buffer_array<4> ba(v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 2);
             BOOST_TEST_EQ(buffer_size(ba), 8);
         }
@@ -421,7 +446,7 @@ struct buffer_array_test
             bool threw = false;
             try
             {
-                mutable_buffer_array<2> ba(
+                detail::mutable_buffer_array<2> ba(
                     std::in_place, v.begin(), v.end());
                 (void)ba;
             }
@@ -438,7 +463,7 @@ struct buffer_array_test
             v.emplace_back(pat.data(), 3);
             v.emplace_back(pat.data() + 3, 5);
             v.emplace_back(pat.data() + 8, pat.size() - 8);
-            mutable_buffer_array<4> ba(
+            detail::mutable_buffer_array<4> ba(
                 std::in_place, v.begin(), v.end());
             BOOST_TEST_EQ(ba.to_span().size(), 3);
             BOOST_TEST_EQ(buffer_size(ba), pat.size());
@@ -452,7 +477,7 @@ struct buffer_array_test
                 std::vector<mutable_buffer> v;
                 v.emplace_back(pat.data(), i);
                 v.emplace_back(pat.data() + i, pat.size() - i);
-                mutable_buffer_array<4> ba(v);
+                detail::mutable_buffer_array<4> ba(v);
                 test::check_sequence(ba, pat);
             }
         }
diff --git a/test/unit/ex/any_executor.cpp b/test/unit/ex/any_executor.cpp
index 6dcf96dcd..4c059fc36 100644
--- a/test/unit/ex/any_executor.cpp
+++ b/test/unit/ex/any_executor.cpp
@@ -250,7 +250,8 @@ struct any_executor_test
 
         std::atomic<int> counter{0};
         auto coro = make_counter_coro(counter);
-        ex.dispatch(coro.handle());
+        continuation c{coro.handle()};
+        ex.dispatch(c);
         coro.release();
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= 1; }));
@@ -266,7 +267,8 @@ struct any_executor_test
 
         std::atomic<int> counter{0};
         auto coro = make_counter_coro(counter);
-        ex.post(coro.handle());
+        continuation c{coro.handle()};
+        ex.post(c);
         coro.release();
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= 1; }));
@@ -276,18 +278,34 @@ struct any_executor_test
     void
     testMultiplePost()
     {
+        std::atomic<int> counter{0};
+        constexpr int N = 10;
+
+        // continuations must outlive pool to avoid
+        // dangling pointers in the executor queue.
+        counter_coro coros[N] = {
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+        };
+        continuation conts[N] = {};
+
         thread_pool pool(2);
         auto executor = pool.get_executor();
         any_executor ex(executor);
 
-        std::atomic<int> counter{0};
-        constexpr int N = 10;
-
         for(int i = 0; i < N; ++i)
         {
-            auto coro = make_counter_coro(counter);
-            ex.post(coro.handle());
-            coro.release();
+            conts[i] = continuation{coros[i].handle()};
+            ex.post(conts[i]);
+            coros[i].release();
         }
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= N; }));
@@ -297,11 +315,20 @@ struct any_executor_test
     void
     testSharedOwnership()
     {
+        std::atomic<int> counter{0};
+
+        // continuations must outlive pool to avoid
+        // dangling pointers in the executor queue.
+        auto coro1 = make_counter_coro(counter);
+        auto coro2 = make_counter_coro(counter);
+        auto coro3 = make_counter_coro(counter);
+        continuation c1{coro1.handle()};
+        continuation c2{coro2.handle()};
+        continuation c3{coro3.handle()};
+
         thread_pool pool(1);
         auto executor = pool.get_executor();
 
-        std::atomic<int> counter{0};
-
         // Create any_executor and make copies
         any_executor ex1(executor);
         any_executor ex2 = ex1;
@@ -312,21 +339,12 @@ struct any_executor_test
         BOOST_TEST(ex2 == ex3);
 
         // Post through different copies
-        {
-            auto coro = make_counter_coro(counter);
-            ex1.post(coro.handle());
-            coro.release();
-        }
-        {
-            auto coro = make_counter_coro(counter);
-            ex2.post(coro.handle());
-            coro.release();
-        }
-        {
-            auto coro = make_counter_coro(counter);
-            ex3.post(coro.handle());
-            coro.release();
-        }
+        ex1.post(c1);
+        coro1.release();
+        ex2.post(c2);
+        coro2.release();
+        ex3.post(c3);
+        coro3.release();
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= 3; }));
         BOOST_TEST_EQ(counter.load(), 3);
diff --git a/test/unit/ex/async_event.cpp b/test/unit/ex/async_event.cpp
index b9ffffff1..ee04532a7 100644
--- a/test/unit/ex/async_event.cpp
+++ b/test/unit/ex/async_event.cpp
@@ -17,6 +17,7 @@
 #include <boost/capy/ex/run_async.hpp>
 #include <boost/capy/ex/thread_pool.hpp>
 #include <boost/capy/io_result.hpp>
+#include <boost/capy/io_task.hpp>
 #include <boost/capy/task.hpp>
 #include <boost/capy/when_all.hpp>
 
@@ -680,18 +681,18 @@ struct async_event_test
         h.destroy();
     }
 
-    static task<void>
+    static io_task<>
     set_event_task(async_event& evt)
     {
         evt.set();
-        co_return;
+        co_return io_result<>{};
     }
 
     static task<void>
     when_all_set_event_main(bool& finished)
     {
         async_event evt;
-        co_await when_all(evt.wait(), set_event_task(evt));
+        (void) co_await when_all(evt.wait(), set_event_task(evt));
         finished = true;
     }
 
diff --git a/test/unit/ex/async_mutex.cpp b/test/unit/ex/async_mutex.cpp
index fc5bfd3a9..1b83125be 100644
--- a/test/unit/ex/async_mutex.cpp
+++ b/test/unit/ex/async_mutex.cpp
@@ -20,6 +20,12 @@
 #include <queue>
 #include <stop_token>
 
+// GCC gives false positive -Wmaybe-uninitialized on structured bindings
+// via the tuple protocol inside coroutine frames.
+#if defined(__GNUC__) && !defined(__clang__)
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
+
 #include "test_helpers.hpp"
 
 namespace boost {
diff --git a/test/unit/ex/detail/timer_service.cpp b/test/unit/ex/detail/timer_service.cpp
new file mode 100644
index 000000000..9da009769
--- /dev/null
+++ b/test/unit/ex/detail/timer_service.cpp
@@ -0,0 +1,287 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// Test that header file is self-contained.
+#include <boost/capy/ex/detail/timer_service.hpp>
+
+#include <boost/capy/ex/thread_pool.hpp>
+
+#include "test_helpers.hpp"
+
+#include <atomic>
+#include <latch>
+
+namespace boost {
+namespace capy {
+
+using namespace std::chrono_literals;
+
+struct timer_service_test
+{
+    // Test: timer fires after duration
+    void
+    testBasicFire()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        std::latch done(1);
+        bool fired = false;
+
+        ts.schedule_after(1ms, [&] {
+            fired = true;
+            done.count_down();
+        });
+
+        done.wait();
+        BOOST_TEST(fired);
+    }
+
+    // Test: cancel prevents callback from firing
+    void
+    testCancelBeforeFire()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        bool fired = false;
+
+        auto id = ts.schedule_after(1s, [&] {
+            fired = true;
+        });
+
+        ts.cancel(id);
+
+        // Give some time to confirm it doesn't fire
+        std::this_thread::sleep_for(20ms);
+        BOOST_TEST(!fired);
+    }
+
+    // Test: cancel on already-fired timer is safe
+    void
+    testCancelAfterFire()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        std::latch done(1);
+
+        auto id = ts.schedule_after(1ms, [&] {
+            done.count_down();
+        });
+
+        done.wait();
+        // Should not block or crash
+        ts.cancel(id);
+    }
+
+    // Test: multiple timers fire in deadline order
+    void
+    testFiringOrder()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        std::vector<int> order;
+        std::mutex mu;
+        std::latch done(3);
+
+        auto const scale = failsafe_scale;
+
+        ts.schedule_after(30ms * scale, [&] {
+            std::lock_guard lock(mu);
+            order.push_back(3);
+            done.count_down();
+        });
+        ts.schedule_after(10ms * scale, [&] {
+            std::lock_guard lock(mu);
+            order.push_back(1);
+            done.count_down();
+        });
+        ts.schedule_after(20ms * scale, [&] {
+            std::lock_guard lock(mu);
+            order.push_back(2);
+            done.count_down();
+        });
+
+        done.wait();
+        BOOST_TEST_EQ(order.size(), 3u);
+        BOOST_TEST_EQ(order[0], 1);
+        BOOST_TEST_EQ(order[1], 2);
+        BOOST_TEST_EQ(order[2], 3);
+    }
+
+    // Test: zero duration fires promptly
+    void
+    testZeroDuration()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        std::latch done(1);
+        auto start = std::chrono::steady_clock::now();
+
+        ts.schedule_after(0ms, [&] {
+            done.count_down();
+        });
+
+        done.wait();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        BOOST_TEST(elapsed < 100ms);
+    }
+
+    // Test: many timers scheduled concurrently
+    void
+    testManyConcurrent()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        constexpr int N = 100;
+        std::atomic<int> count{0};
+        std::latch done(N);
+
+        for(int i = 0; i < N; ++i)
+        {
+            ts.schedule_after(1ms, [&] {
+                count.fetch_add(1, std::memory_order_relaxed);
+                done.count_down();
+            });
+        }
+
+        done.wait();
+        BOOST_TEST_EQ(count.load(), N);
+    }
+
+    // Test: cancel subset of timers
+    void
+    testCancelSubset()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        std::atomic<int> count{0};
+        std::latch done(1);
+
+        auto id1 = ts.schedule_after(10ms, [&] {
+            count.fetch_add(1, std::memory_order_relaxed);
+        });
+        ts.schedule_after(10ms, [&] {
+            count.fetch_add(1, std::memory_order_relaxed);
+            done.count_down();
+        });
+        auto id3 = ts.schedule_after(10ms, [&] {
+            count.fetch_add(1, std::memory_order_relaxed);
+        });
+
+        ts.cancel(id1);
+        ts.cancel(id3);
+
+        // Wait for the uncancelled timer to fire
+        done.wait();
+        // Give time for any incorrectly-uncancelled timers
+        std::this_thread::sleep_for(20ms);
+        BOOST_TEST_EQ(count.load(), 1);
+    }
+
+    // Test: shutdown with pending timers doesn't crash
+    void
+    testShutdownWithPending()
+    {
+        {
+            thread_pool pool(1);
+            auto& ts = pool.get_executor().context()
+                .use_service<detail::timer_service>();
+
+            // Schedule timers far in the future
+            ts.schedule_after(10s, [] {});
+            ts.schedule_after(10s, [] {});
+            ts.schedule_after(10s, [] {});
+
+            // pool destructor calls shutdown — should not hang
+        }
+        BOOST_TEST(true);
+    }
+
+    // Test: timer fires at or after the specified duration
+    void
+    testFiresAtOrAfter()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        std::latch done(1);
+        auto start = std::chrono::steady_clock::now();
+        auto dur = 50ms;
+
+        ts.schedule_after(dur, [&] {
+            done.count_down();
+        });
+
+        done.wait();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        BOOST_TEST(elapsed >= dur);
+    }
+
+    // Test: cancel blocks while callback is executing
+    void
+    testCancelBlocksDuringExecution()
+    {
+        thread_pool pool(1);
+        auto& ts = pool.get_executor().context()
+            .use_service<detail::timer_service>();
+
+        std::atomic<bool> callback_started{false};
+        std::atomic<bool> callback_finished{false};
+        std::latch started(1);
+
+        auto id = ts.schedule_after(1ms, [&] {
+            callback_started.store(true);
+            started.count_down();
+            std::this_thread::sleep_for(50ms);
+            callback_finished.store(true);
+        });
+
+        // Wait for callback to start executing
+        started.wait();
+        BOOST_TEST(callback_started.load());
+
+        // cancel() must block until callback finishes
+        ts.cancel(id);
+        BOOST_TEST(callback_finished.load());
+    }
+
+    void
+    run()
+    {
+        testBasicFire();
+        testCancelBeforeFire();
+        testCancelAfterFire();
+        testFiringOrder();
+        testZeroDuration();
+        testManyConcurrent();
+        testCancelSubset();
+        testShutdownWithPending();
+        testFiresAtOrAfter();
+        testCancelBlocksDuringExecution();
+    }
+};
+
+TEST_SUITE(timer_service_test, "capy.ex.timer_service");
+
+} // capy
+} // boost
diff --git a/test/unit/ex/executor_ref.cpp b/test/unit/ex/executor_ref.cpp
index ae93e4d27..f01360ac3 100644
--- a/test/unit/ex/executor_ref.cpp
+++ b/test/unit/ex/executor_ref.cpp
@@ -192,7 +192,8 @@ struct executor_ref_test
 
         std::atomic<int> counter{0};
         auto coro = make_counter_coro(counter);
-        ex.dispatch(coro.handle());
+        continuation c{coro.handle()};
+        ex.dispatch(c);
         coro.release();
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= 1; }));
@@ -208,7 +209,8 @@ struct executor_ref_test
 
         std::atomic<int> counter{0};
         auto coro = make_counter_coro(counter);
-        ex.post(coro.handle());
+        continuation c{coro.handle()};
+        ex.post(c);
         coro.release();
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= 1; }));
@@ -218,18 +220,34 @@ struct executor_ref_test
     void
     testMultiplePost()
     {
+        std::atomic<int> counter{0};
+        constexpr int N = 10;
+
+        // continuations must outlive pool to avoid
+        // dangling pointers in the executor queue.
+        counter_coro coros[N] = {
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+            make_counter_coro(counter),
+        };
+        continuation conts[N] = {};
+
         thread_pool pool(2);
         auto executor = pool.get_executor();
         executor_ref ex(executor);
 
-        std::atomic<int> counter{0};
-        constexpr int N = 10;
-
         for(int i = 0; i < N; ++i)
         {
-            auto coro = make_counter_coro(counter);
-            ex.post(coro.handle());
-            coro.release();
+            conts[i] = continuation{coros[i].handle()};
+            ex.post(conts[i]);
+            coros[i].release();
         }
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= N; }));
diff --git a/test/unit/ex/frame_allocator.cpp b/test/unit/ex/frame_allocator.cpp
index c650accf5..93c71123a 100644
--- a/test/unit/ex/frame_allocator.cpp
+++ b/test/unit/ex/frame_allocator.cpp
@@ -78,6 +78,14 @@ TLS restoration on resume:
     After awaiting a child, the parent's TLS may have been changed by the child.
     transform_awaiter::await_resume restores parent's allocator from its promise.
 
+Event loops must use safe_resume:
+    Between a coroutine's await_resume (which sets TLS) and the next child
+    invocation (whose operator new reads TLS), arbitrary user code runs. If
+    that code resumes a coroutine from a different chain on the same thread,
+    the other coroutine's await_resume overwrites TLS. Event loops, strand
+    dispatch loops, and any code that calls .resume() must use safe_resume()
+    to save and restore TLS around the call.
+
 memory_resource* lifetime:
     When passing memory_resource* directly, the user is responsible for ensuring
     it outlives all tasks. This matches std::pmr conventions.
diff --git a/test/unit/ex/frame_cb.cpp b/test/unit/ex/frame_cb.cpp
new file mode 100644
index 000000000..60c8aa16e
--- /dev/null
+++ b/test/unit/ex/frame_cb.cpp
@@ -0,0 +1,201 @@
+//
+// Copyright (c) 2026 Vinnie Falco (vinnie.falco@gmail.com)
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#include <boost/capy/concept/io_awaitable.hpp>
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/detail/await_suspend_helper.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/io_result.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/run_blocking.hpp>
+
+#include "test/unit/test_helpers.hpp"
+
+#include <coroutine>
+#include <latch>
+#include <memory>
+
+namespace boost {
+namespace capy {
+
+namespace detail {
+
+struct frame_cb
+{
+    void (*resume)(frame_cb*);
+    void (*destroy)(frame_cb*);
+    void* data;
+};
+
+} // detail
+
+struct frame_cb_test
+{
+    void
+    testResumeCallsFunctionPointer()
+    {
+        bool called = false;
+        detail::frame_cb cb;
+        cb.resume = +[](detail::frame_cb* p) {
+            *static_cast<bool*>(p->data) = true;
+        };
+        cb.destroy = +[](detail::frame_cb*) {};
+        cb.data = &called;
+
+        cb.resume(&cb);
+        BOOST_TEST(called);
+    }
+
+    void
+    testDestroyIsNoOp()
+    {
+        bool destroy_called = false;
+        detail::frame_cb cb;
+        cb.resume = +[](detail::frame_cb*) {};
+        cb.destroy = +[](detail::frame_cb* p) {
+            *static_cast<bool*>(p->data) = true;
+        };
+        cb.data = &destroy_called;
+
+        cb.destroy(&cb);
+        BOOST_TEST(destroy_called);
+    }
+
+    void
+    testDataPointerPassedThrough()
+    {
+        int value = 0;
+        detail::frame_cb cb;
+        cb.resume = +[](detail::frame_cb* p) {
+            *static_cast<int*>(p->data) = 42;
+        };
+        cb.destroy = +[](detail::frame_cb*) {};
+        cb.data = &value;
+
+        cb.resume(&cb);
+        BOOST_TEST_EQ(value, 42);
+    }
+
+    // IoAwaitable that resumes synchronously and returns a value
+    struct sync_awaitable
+    {
+        int value;
+
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(
+            std::coroutine_handle<> h,
+            io_env const*) noexcept
+        {
+            return h;
+        }
+
+        io_result<int> await_resume() noexcept
+        {
+            return {std::error_code{}, value};
+        }
+    };
+
+    static_assert(IoAwaitable<sync_awaitable>);
+
+    static task<int>
+    await_sync(int v)
+    {
+        auto [ec, result] = co_await sync_awaitable{v};
+        co_return result;
+    }
+
+    void
+    testWithIoAwaitable()
+    {
+        int result = 0;
+        test::run_blocking(
+            [&](int v) { result = v; })(
+            await_sync(99));
+        BOOST_TEST_EQ(result, 99);
+    }
+
+    // IoAwaitable that posts to executor (async resume)
+    struct async_awaitable
+    {
+        int value;
+        continuation cont_;
+
+        bool await_ready() const noexcept
+        {
+            return false;
+        }
+
+        std::coroutine_handle<>
+        await_suspend(
+            std::coroutine_handle<> h,
+            io_env const* env) noexcept
+        {
+            cont_.h = h;
+            env->executor.post(cont_);
+            return std::noop_coroutine();
+        }
+
+        io_result<int> await_resume() noexcept
+        {
+            return {std::error_code{}, value};
+        }
+    };
+
+    static_assert(IoAwaitable<async_awaitable>);
+
+    static task<int>
+    await_async(int v)
+    {
+        auto [ec, result] = co_await async_awaitable{v, {}};
+        co_return result;
+    }
+
+    void
+    testWithAsyncAwaitable()
+    {
+        auto pool = std::make_unique<thread_pool>(1);
+        auto ex = pool->get_executor();
+        std::latch done(1);
+        int result = 0;
+
+        run_async(ex,
+            [&](int v) {
+                result = v;
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(await_async(99));
+
+        done.wait();
+        BOOST_TEST_EQ(result, 99);
+    }
+
+    void
+    run()
+    {
+        testResumeCallsFunctionPointer();
+        testDestroyIsNoOp();
+        testDataPointerPassedThrough();
+        testWithIoAwaitable();
+        testWithAsyncAwaitable();
+    }
+};
+
+TEST_SUITE(frame_cb_test, "capy.frame_cb");
+
+} // capy
+} // boost
diff --git a/test/unit/ex/immediate.cpp b/test/unit/ex/immediate.cpp
index 688ed96b4..fec1826b9 100644
--- a/test/unit/ex/immediate.cpp
+++ b/test/unit/ex/immediate.cpp
@@ -77,7 +77,7 @@ struct immediate_test
             immediate<io_result<std::size_t>> im{{{}, 42}};
             auto r = im.await_resume();
             BOOST_TEST(!r.ec);
-            BOOST_TEST_EQ(r.t1, 42u);
+            BOOST_TEST_EQ(std::get<0>(r.values), 42u);
         }
     }
 
@@ -102,7 +102,7 @@ struct immediate_test
             io_result<std::size_t> result{};
             test::run_blocking([&](io_result<std::size_t> v) { result = v; })(coro());
             BOOST_TEST(!result.ec);
-            BOOST_TEST_EQ(result.t1, 100u);
+            BOOST_TEST_EQ(std::get<0>(result.values), 100u);
         }
 
         // Structured binding with co_await
@@ -151,7 +151,7 @@ struct immediate_test
             BOOST_TEST(im.await_ready());
             auto r = im.await_resume();
             BOOST_TEST(!r.ec);
-            BOOST_TEST_EQ(r.t1, 42u);
+            BOOST_TEST_EQ(std::get<0>(r.values), 42u);
         }
 
         // co_await ready(n)
@@ -177,8 +177,8 @@ struct immediate_test
             BOOST_TEST(im.await_ready());
             auto r = im.await_resume();
             BOOST_TEST(!r.ec);
-            BOOST_TEST_EQ(r.t1, 42);
-            BOOST_TEST_EQ(r.t2, 3.14);
+            BOOST_TEST_EQ(std::get<0>(r.values), 42);
+            BOOST_TEST_EQ(std::get<1>(r.values), 3.14);
         }
 
         // co_await ready(a, b)
@@ -204,9 +204,9 @@ struct immediate_test
             BOOST_TEST(im.await_ready());
             auto r = im.await_resume();
             BOOST_TEST(!r.ec);
-            BOOST_TEST_EQ(r.t1, 1);
-            BOOST_TEST_EQ(r.t2, 2);
-            BOOST_TEST_EQ(r.t3, 3);
+            BOOST_TEST_EQ(std::get<0>(r.values), 1);
+            BOOST_TEST_EQ(std::get<1>(r.values), 2);
+            BOOST_TEST_EQ(std::get<2>(r.values), 3);
         }
 
         // co_await ready(a, b, c)
@@ -242,7 +242,7 @@ struct immediate_test
             BOOST_TEST(im.await_ready());
             auto r = im.await_resume();
             BOOST_TEST(r.ec);
-            BOOST_TEST_EQ(r.t1, 0u);
+            BOOST_TEST_EQ(std::get<0>(r.values), 0u);
         }
 
         // ready(ec, T1, T2) creates failed two-value result
diff --git a/test/unit/ex/priority_executor.hpp b/test/unit/ex/priority_executor.hpp
new file mode 100644
index 000000000..cdbfb9fc0
--- /dev/null
+++ b/test/unit/ex/priority_executor.hpp
@@ -0,0 +1,241 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+#ifndef BOOST_CAPY_TEST_UNIT_EX_PRIORITY_EXECUTOR_HPP
+#define BOOST_CAPY_TEST_UNIT_EX_PRIORITY_EXECUTOR_HPP
+
+#include <boost/capy/concept/executor.hpp>
+#include <boost/capy/continuation.hpp>
+#include <boost/capy/ex/execution_context.hpp>
+#include <boost/capy/ex/executor_ref.hpp>
+#include <boost/capy/ex/frame_allocator.hpp>
+
+#include <atomic>
+#include <coroutine>
+#include <exception>
+#include <mutex>
+#include <thread>
+#include <type_traits>
+#include <utility>
+
+namespace boost {
+namespace capy {
+namespace test {
+
+/** Test-only strand-shaped executor that drains high before low.
+*/
+struct priority_executor_state
+{
+    std::mutex mutex;
+    continuation* high_head = nullptr;
+    continuation* high_tail = nullptr;
+    continuation* low_head = nullptr;
+    continuation* low_tail = nullptr;
+    bool locked = false;
+    std::atomic<std::thread::id> dispatch_thread{};
+};
+
+namespace detail {
+
+struct priority_invoker
+{
+    struct promise_type
+    {
+        continuation self;
+
+        priority_invoker get_return_object() noexcept
+        {
+            return {std::coroutine_handle<promise_type>::from_promise(*this)};
+        }
+
+        std::suspend_always initial_suspend() noexcept { return {}; }
+        std::suspend_never final_suspend() noexcept { return {}; }
+        void return_void() noexcept {}
+        void unhandled_exception() { std::terminate(); }
+    };
+
+    std::coroutine_handle<promise_type> h_;
+};
+
+inline void
+drain_list(continuation* head) noexcept
+{
+    while(head)
+    {
+        continuation* c = head;
+        head = c->next;
+        c->next = nullptr;
+        ::boost::capy::safe_resume(c->h);
+    }
+}
+
+inline priority_invoker
+make_priority_invoker(priority_executor_state* s)
+{
+    for(;;)
+    {
+        s->dispatch_thread.store(
+            std::this_thread::get_id(),
+            std::memory_order_release);
+
+        continuation* high_head;
+        continuation* low_head;
+        {
+            std::lock_guard<std::mutex> lk(s->mutex);
+            high_head = s->high_head;
+            low_head = s->low_head;
+            s->high_head = nullptr;
+            s->high_tail = nullptr;
+            s->low_head = nullptr;
+            s->low_tail = nullptr;
+        }
+
+        drain_list(high_head);
+        drain_list(low_head);
+
+        {
+            std::lock_guard<std::mutex> lk(s->mutex);
+            if(!s->high_head && !s->low_head)
+            {
+                s->locked = false;
+                s->dispatch_thread.store(
+                    std::thread::id{},
+                    std::memory_order_release);
+                co_return;
+            }
+        }
+    }
+}
+
+} // namespace detail
+
+/** Executor view over priority_executor_state. Dispatch has the same
+    thread-check fast path as strand; post defaults to the low queue.
+*/
+template<class Ex>
+class priority_executor
+{
+    priority_executor_state* state_;
+    Ex inner_ex_;
+
+    enum class priority { high, low };
+
+    void
+    enqueue_under_lock(continuation& c, priority p) const noexcept
+    {
+        c.next = nullptr;
+        if(p == priority::high)
+        {
+            if(state_->high_tail) state_->high_tail->next = &c;
+            else state_->high_head = &c;
+            state_->high_tail = &c;
+        }
+        else
+        {
+            if(state_->low_tail) state_->low_tail->next = &c;
+            else state_->low_head = &c;
+            state_->low_tail = &c;
+        }
+    }
+
+    void
+    post_with_priority(continuation& c, priority p) const
+    {
+        bool first;
+        {
+            std::lock_guard<std::mutex> lk(state_->mutex);
+            enqueue_under_lock(c, p);
+            first = !state_->locked;
+            if(first) state_->locked = true;
+        }
+        if(first)
+            post_invoker();
+    }
+
+    void
+    post_invoker() const
+    {
+        auto inv = detail::make_priority_invoker(state_);
+        auto& self = inv.h_.promise().self;
+        self.h = inv.h_;
+        self.next = nullptr;
+        inner_ex_.post(self);
+    }
+
+public:
+    priority_executor(priority_executor_state& state, Ex inner) noexcept(
+        std::is_nothrow_move_constructible_v<Ex>)
+        : state_(&state)
+        , inner_ex_(std::move(inner))
+    {
+    }
+
+    priority_executor(priority_executor const&) noexcept(
+        std::is_nothrow_copy_constructible_v<Ex>) = default;
+    priority_executor(priority_executor&&) noexcept(
+        std::is_nothrow_move_constructible_v<Ex>) = default;
+    priority_executor& operator=(priority_executor const&) = default;
+    priority_executor& operator=(priority_executor&&) noexcept(
+        std::is_nothrow_move_assignable_v<Ex>) = default;
+
+    bool
+    operator==(priority_executor const& other) const noexcept
+    {
+        return state_ == other.state_;
+    }
+
+    auto&
+    context() const noexcept
+    {
+        return inner_ex_.context();
+    }
+
+    void on_work_started() const noexcept { inner_ex_.on_work_started(); }
+    void on_work_finished() const noexcept { inner_ex_.on_work_finished(); }
+
+    bool
+    running_in_this_thread() const noexcept
+    {
+        return state_->dispatch_thread.load(std::memory_order_acquire)
+            == std::this_thread::get_id();
+    }
+
+    std::coroutine_handle<>
+    dispatch(continuation& c) const
+    {
+        if(running_in_this_thread())
+            return c.h;
+        post_with_priority(c, priority::low);
+        return std::noop_coroutine();
+    }
+
+    void
+    post(continuation& c) const
+    {
+        post_with_priority(c, priority::low);
+    }
+
+    void
+    post_high(continuation& c) const
+    {
+        post_with_priority(c, priority::high);
+    }
+
+    void
+    post_low(continuation& c) const
+    {
+        post_with_priority(c, priority::low);
+    }
+};
+
+} // namespace test
+} // namespace capy
+} // namespace boost
+
+#endif
diff --git a/test/unit/ex/run.cpp b/test/unit/ex/run.cpp
index 5fe17ccd9..5c67645ef 100644
--- a/test/unit/ex/run.cpp
+++ b/test/unit/ex/run.cpp
@@ -19,7 +19,13 @@
 #include "test/unit/custom_task.hpp"
 #include "test/unit/test_helpers.hpp"
 
+#include <boost/capy/ex/strand.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/test/run_blocking.hpp>
+
+#include <latch>
 #include <memory>
+#include <thread>
 
 namespace boost {
 namespace capy {
@@ -337,6 +343,97 @@ struct run_test
         BOOST_TEST(called);
     }
 
+    //----------------------------------------------------------
+    // Stop Token Propagation
+    //----------------------------------------------------------
+
+    static task<bool>
+    check_stop_requested()
+    {
+        auto token = co_await this_coro::stop_token;
+        co_return token.stop_requested();
+    }
+
+    void
+    testStopTokenInheritance()
+    {
+        // Verify run(ex) inherits the caller's stop token
+        int dispatch_count = 0;
+        test_executor ex(1, dispatch_count);
+        std::stop_source source;
+        source.request_stop();
+        bool result = false;
+
+        auto outer = [&]() -> task<bool> {
+            // run(ex) with no explicit stop token should inherit
+            // the caller's token (which is stopped)
+            co_return co_await capy::run(ex)(check_stop_requested());
+        };
+
+        run_async(ex, source.get_token(),
+            [&](bool v) { result = v; })(outer());
+
+        BOOST_TEST(result);
+    }
+
+    void
+    testStopTokenOverrideInnerStopped()
+    {
+        // Stop the inner (override) token only.
+        // Inner task should see stopped; outer should not.
+        int dispatch_count = 0;
+        test_executor ex(1, dispatch_count);
+        std::stop_source caller_source;
+        std::stop_source override_source;
+        override_source.request_stop();
+
+        bool outer_stopped = true;
+        bool inner_stopped = false;
+
+        auto outer = [&]() -> task<void> {
+            auto token = co_await this_coro::stop_token;
+            outer_stopped = token.stop_requested();
+            inner_stopped = co_await capy::run(ex, override_source.get_token())(
+                check_stop_requested());
+        };
+
+        run_async(ex, caller_source.get_token())(outer());
+
+        BOOST_TEST(!outer_stopped);
+        BOOST_TEST(inner_stopped);
+    }
+
+    void
+    testStopTokenOverrideOuterStopped()
+    {
+        // Stop the outer (caller) token only.
+        // Outer task should see stopped; inner (override) should not.
+        int dispatch_count = 0;
+        test_executor ex(1, dispatch_count);
+        std::stop_source caller_source;
+        caller_source.request_stop();
+        std::stop_source override_source;
+
+        bool outer_stopped = false;
+        bool inner_stopped = true;
+
+        auto outer = [&]() -> task<void> {
+            auto token = co_await this_coro::stop_token;
+            outer_stopped = token.stop_requested();
+            inner_stopped = co_await capy::run(ex, override_source.get_token())(
+                check_stop_requested());
+        };
+
+        run_async(ex, caller_source.get_token())(outer());
+
+        BOOST_TEST(outer_stopped);
+        BOOST_TEST(!inner_stopped);
+    }
+
+    //----------------------------------------------------------
+    // Allocator Propagation
+    //----------------------------------------------------------
+
     void
     testAllocatorPropagation()
     {
@@ -377,6 +474,63 @@ struct run_test
         BOOST_TEST(result);
     }
 
+    void
+    testRunExStrandFirstInstruction()
+    {
+        // Verify that the first instructions of a task passed
+        // to run(strand) execute inside the strand's serialization,
+        // not inline on an unprotected thread.
+        thread_pool pool(2, "str-pool-");
+        strand s(pool.get_executor());
+        bool inside_strand = false;
+        std::latch done(1);
+
+        auto inner = [&]() -> task<void> {
+            inside_strand = s.running_in_this_thread();
+            co_return;
+        };
+
+        auto outer = [&]() -> task<void> {
+            co_await capy::run(s)(inner());
+        };
+
+        run_async(pool.get_executor(),
+            [&]() { done.count_down(); })(outer());
+        done.wait();
+
+        BOOST_TEST(inside_strand);
+        pool.join();
+    }
+
+    // co_await run(compute_exec)(...) from an io loop must return
+    // the caller to the io thread, not leave it on a compute worker.
+    void
+    testHopsBackToIoThread()
+    {
+        thread_pool compute_pool(2, "compute-");
+
+        std::thread::id io_tid = std::this_thread::get_id();
+        std::thread::id compute_tid{};
+        std::thread::id parent_tid_after_run{};
+
+        test::run_blocking()([&]() -> task<void> {
+            auto compute_exec = compute_pool.get_executor();
+
+            co_await capy::run(compute_exec)([&]() -> task<void> {
+                compute_tid = std::this_thread::get_id();
+                co_return;
+            }());
+
+            parent_tid_after_run = std::this_thread::get_id();
+        }());
+
+        BOOST_TEST(compute_tid != std::thread::id{});
+        BOOST_TEST(compute_tid != io_tid);
+        BOOST_TEST_EQ(parent_tid_after_run, io_tid);
+
+        compute_pool.join();
+    }
+
     void
     run()
     {
@@ -394,8 +548,13 @@ struct run_test
         testStopTokenWithAllocator();
         testVoidWithStopToken();
         testVoidWithMemoryResource();
+        testStopTokenInheritance();
+        testStopTokenOverrideInnerStopped();
+        testStopTokenOverrideOuterStopped();
         testAllocatorPropagation();
         testAllocatorPropagationThroughRun();
+        testRunExStrandFirstInstruction();
+        testHopsBackToIoThread();
     }
 };
 
diff --git a/test/unit/ex/run_async.cpp b/test/unit/ex/run_async.cpp
index 9a8a9003a..8816b0362 100644
--- a/test/unit/ex/run_async.cpp
+++ b/test/unit/ex/run_async.cpp
@@ -78,16 +78,16 @@ struct sync_executor
     void on_work_started() const noexcept {}
     void on_work_finished() const noexcept {}
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
         if(dispatch_count_)
             ++(*dispatch_count_);
-        return h;
+        return c.h;
     }
 
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
-        h.resume();
+        c.h.resume();
     }
 };
 
@@ -120,15 +120,15 @@ struct queue_executor
     void on_work_started() const noexcept {}
     void on_work_finished() const noexcept {}
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
-        queue_->push(h);
+        queue_->push(c.h);
         return std::noop_coroutine();
     }
 
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
-        queue_->push(h);
+        queue_->push(c.h);
     }
 };
 
@@ -358,6 +358,46 @@ struct run_async_test
         BOOST_TEST(result);
     }
 
+    void
+    testScopedCancellation()
+    {
+        // Three tasks on the same executor: one with a scoped stop token,
+        // two with the default (empty) token. Cancelling the scoped token
+        // should only affect that task, not the others.
+        std::queue<std::coroutine_handle<>> queue;
+        queue_executor d(queue);
+
+        bool default_1_stopped = true;
+        bool scoped_stopped = false;
+        bool default_2_stopped = true;
+
+        std::stop_source scoped_source;
+
+        run_async(d, [&](bool v) { default_1_stopped = v; })(
+            check_stop_requested());
+        run_async(d, scoped_source.get_token(),
+            [&](bool v) { scoped_stopped = v; })(
+            check_stop_requested());
+        run_async(d, [&](bool v) { default_2_stopped = v; })(
+            check_stop_requested());
+
+        BOOST_TEST_EQ(queue.size(), 3u);
+
+        // Cancel the scoped source before draining
+        scoped_source.request_stop();
+
+        while(!queue.empty())
+        {
+            auto h = queue.front();
+            queue.pop();
+            h.resume();
+        }
+
+        BOOST_TEST(!default_1_stopped);
+        BOOST_TEST(scoped_stopped);
+        BOOST_TEST(!default_2_stopped);
+    }
+
     //----------------------------------------------------------
     // Allocator Propagation
     //----------------------------------------------------------
@@ -641,6 +681,7 @@ struct run_async_test
         // Stop Token
         testStopTokenPropagation();
         testCancellationVisible();
+        testScopedCancellation();
 
         // Allocator Propagation
         testAllocatorPropagation();
diff --git a/test/unit/ex/run_priority.cpp b/test/unit/ex/run_priority.cpp
new file mode 100644
index 000000000..b890936fc
--- /dev/null
+++ b/test/unit/ex/run_priority.cpp
@@ -0,0 +1,242 @@
+//
+// Copyright (c) 2026 Steve Gerbino
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// Tests that fail against the current run()'s use of dispatch at
+// cross-executor boundaries; pass once run() posts on both trips.
+
+#include <boost/capy/concept/executor.hpp>
+#include <boost/capy/ex/run.hpp>
+#include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/strand.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/task.hpp>
+
+#include "priority_executor.hpp"
+#include "test/unit/test_helpers.hpp"
+#include "test_suite.hpp"
+
+#include <coroutine>
+#include <cstddef>
+#include <queue>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace boost {
+namespace capy {
+
+static_assert(Executor<test::priority_executor<queuing_executor>>,
+    "priority_executor must satisfy Executor concept");
+
+namespace {
+
+// Bare coroutine that appends a message and ends. Posted directly.
+struct log_coro
+{
+    struct promise_type
+    {
+        std::vector<std::string>* log;
+        std::string msg;
+
+        log_coro get_return_object() noexcept
+        {
+            return log_coro{
+                std::coroutine_handle<promise_type>::from_promise(*this)};
+        }
+
+        std::suspend_always initial_suspend() noexcept { return {}; }
+        std::suspend_never final_suspend() noexcept { return {}; }
+        void return_void() noexcept {}
+        void unhandled_exception() { std::terminate(); }
+    };
+
+    std::coroutine_handle<promise_type> h_;
+
+    ~log_coro() { if(h_) h_.destroy(); }
+
+    log_coro(log_coro&& o) noexcept : h_(o.h_) { o.h_ = nullptr; }
+
+    std::coroutine_handle<void> handle() const noexcept { return h_; }
+
+    void release() noexcept { h_ = nullptr; }
+
+private:
+    explicit log_coro(std::coroutine_handle<promise_type> h) : h_(h) {}
+};
+
+inline log_coro
+make_log_coro(std::vector<std::string>& log, std::string msg)
+{
+    return [](std::vector<std::string>* log, std::string msg) -> log_coro {
+        log->push_back(std::move(msg));
+        co_return;
+    }(&log, std::move(msg));
+}
+
+inline void
+pump(std::queue<std::coroutine_handle<>>& q)
+{
+    while(!q.empty())
+    {
+        auto h = q.front();
+        q.pop();
+        h.resume();
+    }
+}
+
+} // namespace
+
+struct run_priority_test
+{
+    // run(pe)(inner) from a handler on pe must enqueue inner
+    // behind other work already in pe's queue, not cut in line.
+    void
+    testForwardCrossing()
+    {
+        std::queue<std::coroutine_handle<>> q;
+        queuing_executor qe(q);
+        test::priority_executor_state state;
+        test::priority_executor pe(state, qe);
+
+        std::vector<std::string> log;
+
+        auto inner_task_fn = [&]() -> task<void> {
+            log.push_back("inner");
+            co_return;
+        };
+
+        auto outer_task_fn = [&]() -> task<void> {
+            log.push_back("outer_start");
+            co_await capy::run(pe)(inner_task_fn());
+            log.push_back("outer_end");
+        };
+
+        bool outer_done = false;
+        run_async(pe, [&]() { outer_done = true; })(outer_task_fn());
+
+        auto sibling_coro = make_log_coro(log, "sibling");
+        continuation sibling_cont{sibling_coro.handle()};
+        pe.post(sibling_cont);
+        sibling_coro.release();
+
+        pump(q);
+
+        BOOST_TEST(outer_done);
+
+        BOOST_TEST_EQ(log.size(), std::size_t(4));
+        if(log.size() == 4)
+        {
+            BOOST_TEST_EQ(log[0], std::string("outer_start"));
+            BOOST_TEST_EQ(log[1], std::string("sibling"));
+            BOOST_TEST_EQ(log[2], std::string("inner"));
+            BOOST_TEST_EQ(log[3], std::string("outer_end"));
+        }
+    }
+
+    // The return trip must post the caller back to its executor,
+    // giving pe a tick to drain higher-priority work before the
+    // caller resumes. inline_ex is chosen as the target so the
+    // forward trip is trivial and only the return trip is observed.
+    void
+    testReturnTripParentWrongFrame()
+    {
+        std::queue<std::coroutine_handle<>> q;
+        queuing_executor qe(q);
+        test::priority_executor_state state;
+        test::priority_executor pe(state, qe);
+
+        std::vector<std::string> log;
+
+        auto inner_task_fn = [&]() -> task<void> {
+            log.push_back("inner");
+            co_return;
+        };
+
+        auto outer_task_fn = [&]() -> task<void> {
+            log.push_back("outer_start");
+
+            auto pending_high_coro = make_log_coro(log, "pending_high");
+            continuation pending_high_cont{pending_high_coro.handle()};
+            pe.post_high(pending_high_cont);
+            pending_high_coro.release();
+
+            int dummy = 0;
+            test_executor inline_ex(1, dummy);
+            co_await capy::run(inline_ex)(inner_task_fn());
+
+            log.push_back("outer_end");
+        };
+
+        bool outer_done = false;
+        run_async(pe, [&]() { outer_done = true; })(outer_task_fn());
+
+        pump(q);
+
+        BOOST_TEST(outer_done);
+
+        BOOST_TEST_EQ(log.size(), std::size_t(4));
+        if(log.size() == 4)
+        {
+            BOOST_TEST_EQ(log[0], std::string("outer_start"));
+            BOOST_TEST_EQ(log[1], std::string("inner"));
+            BOOST_TEST_EQ(log[2], std::string("pending_high"));
+            BOOST_TEST_EQ(log[3], std::string("outer_end"));
+        }
+    }
+
+    // run(inner)(work) from inside a strand must actually release
+    // the strand while work runs, not nest work in the strand's frame.
+    void
+    testExitStrandOverPriority()
+    {
+        std::queue<std::coroutine_handle<>> q;
+        queuing_executor qe(q);
+        test::priority_executor_state state;
+        test::priority_executor pe(state, qe);
+
+        strand<test::priority_executor<queuing_executor>> s(pe);
+
+        bool s_running_inside_work = false;
+        bool work_ran = false;
+
+        auto work_task_fn = [&]() -> task<void> {
+            s_running_inside_work = s.running_in_this_thread();
+            work_ran = true;
+            co_return;
+        };
+
+        auto outer_task_fn = [&]() -> task<void> {
+            co_await capy::run(pe)(work_task_fn());
+        };
+
+        bool outer_done = false;
+        run_async(s, [&]() { outer_done = true; })(outer_task_fn());
+
+        pump(q);
+
+        BOOST_TEST(outer_done);
+        BOOST_TEST(work_ran);
+        BOOST_TEST(!s_running_inside_work);
+    }
+
+    void
+    run()
+    {
+        testForwardCrossing();
+        testReturnTripParentWrongFrame();
+        testExitStrandOverPriority();
+    }
+};
+
+TEST_SUITE(
+    run_priority_test,
+    "boost.capy.run.priority");
+
+} // namespace capy
+} // namespace boost
diff --git a/test/unit/ex/safe_resume.cpp b/test/unit/ex/safe_resume.cpp
new file mode 100644
index 000000000..b6fc36f68
--- /dev/null
+++ b/test/unit/ex/safe_resume.cpp
@@ -0,0 +1,175 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// Test that header file is self-contained.
+#include <boost/capy/ex/frame_allocator.hpp>
+
+#include "test_suite.hpp"
+
+#include <coroutine>
+#include <memory_resource>
+#include <utility>
+
+namespace boost {
+namespace capy {
+
+/*
+safe_resume Tests
+=================
+
+Verifies that safe_resume saves and restores the thread-local
+frame allocator around h.resume(). A plain coroutine (no
+io_awaitable_promise_base) is used so that operator new does
+not read TLS — we only care about the TLS value itself.
+*/
+
+namespace {
+
+// Minimal coroutine that overwrites TLS with a known value, then suspends.
+struct spoiler_coro
+{
+    struct promise_type
+    {
+        std::pmr::memory_resource* spoil_to = nullptr;
+
+        spoiler_coro get_return_object()
+        {
+            return spoiler_coro{std::coroutine_handle<promise_type>::from_promise(*this)};
+        }
+
+        std::suspend_always initial_suspend() noexcept { return {}; }
+        std::suspend_always final_suspend() noexcept { return {}; }
+        void return_void() {}
+        void unhandled_exception() {}
+    };
+
+    std::coroutine_handle<promise_type> h_;
+
+    ~spoiler_coro()
+    {
+        if(h_)
+            h_.destroy();
+    }
+
+    spoiler_coro(spoiler_coro&& o) noexcept
+        : h_(std::exchange(o.h_, nullptr))
+    {
+    }
+
+    spoiler_coro& operator=(spoiler_coro&&) = delete;
+    spoiler_coro(spoiler_coro const&) = delete;
+
+private:
+    explicit spoiler_coro(std::coroutine_handle<promise_type> h)
+        : h_(h)
+    {
+    }
+
+    friend promise_type;
+};
+
+// Coroutine body: overwrite TLS and suspend.
+spoiler_coro make_spoiler(std::pmr::memory_resource* spoil_to)
+{
+    set_current_frame_allocator(spoil_to);
+    co_return;
+}
+
+// Coroutine body: overwrite TLS, then safe_resume another coroutine, then suspend.
+spoiler_coro make_nested_spoiler(
+    std::pmr::memory_resource* spoil_to,
+    std::coroutine_handle<> inner)
+{
+    set_current_frame_allocator(spoil_to);
+    safe_resume(inner);
+    // After safe_resume, TLS should be spoil_to again
+    co_return;
+}
+
+} // anonymous namespace
+
+struct safe_resume_test
+{
+    void
+    testSafeResumePreservesTLS()
+    {
+        auto* original = std::pmr::null_memory_resource();
+        auto* spoil = std::pmr::new_delete_resource();
+
+        auto coro = make_spoiler(spoil);
+
+        set_current_frame_allocator(original);
+        safe_resume(coro.h_);
+        BOOST_TEST(get_current_frame_allocator() == original);
+
+        set_current_frame_allocator(nullptr);
+    }
+
+    void
+    testSafeResumePreservesNull()
+    {
+        auto* spoil = std::pmr::new_delete_resource();
+
+        auto coro = make_spoiler(spoil);
+
+        set_current_frame_allocator(nullptr);
+        safe_resume(coro.h_);
+        BOOST_TEST(get_current_frame_allocator() == nullptr);
+    }
+
+    void
+    testRawResumeDoesNotPreserveTLS()
+    {
+        // Documents the problem that safe_resume fixes:
+        // raw .resume() lets the coroutine spoil TLS.
+        auto* original = std::pmr::null_memory_resource();
+        auto* spoil = std::pmr::new_delete_resource();
+
+        auto coro = make_spoiler(spoil);
+
+        set_current_frame_allocator(original);
+        coro.h_.resume();
+        BOOST_TEST(get_current_frame_allocator() == spoil);
+
+        set_current_frame_allocator(nullptr);
+    }
+
+    void
+    testNestedSafeResume()
+    {
+        auto* outer_value = std::pmr::null_memory_resource();
+        auto* middle_value = std::pmr::new_delete_resource();
+        auto* inner_value = std::pmr::get_default_resource();
+
+        auto inner = make_spoiler(inner_value);
+        auto outer = make_nested_spoiler(middle_value, inner.h_);
+
+        set_current_frame_allocator(outer_value);
+        safe_resume(outer.h_);
+        // Outer safe_resume should restore outer_value,
+        // regardless of what happened inside.
+        BOOST_TEST(get_current_frame_allocator() == outer_value);
+
+        set_current_frame_allocator(nullptr);
+    }
+
+    void
+    run()
+    {
+        testSafeResumePreservesTLS();
+        testSafeResumePreservesNull();
+        testRawResumeDoesNotPreserveTLS();
+        testNestedSafeResume();
+    }
+};
+
+TEST_SUITE(safe_resume_test, "capy.safe_resume");
+
+} // capy
+} // boost
diff --git a/test/unit/ex/strand.cpp b/test/unit/ex/strand.cpp
index edd362ca9..00962fc41 100644
--- a/test/unit/ex/strand.cpp
+++ b/test/unit/ex/strand.cpp
@@ -1,5 +1,6 @@
 //
 // Copyright (c) 2025 Vinnie Falco (vinnie.falco@gmail.com)
+// Copyright (c) 2026 Michael Vandeberg
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,9 +11,16 @@
 // Test that header file is self-contained.
 #include <boost/capy/ex/strand.hpp>
 
+// Full strand_impl definition for white-box collision tests.
+#include "src/ex/detail/strand_impl.hpp"
+
 #include <boost/capy/concept/executor.hpp>
 #include <boost/capy/ex/any_executor.hpp>
+#include <boost/capy/ex/run.hpp>
+#include <boost/capy/ex/this_coro.hpp>
 #include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/run_blocking.hpp>
 
 #include "test_suite.hpp"
 
@@ -227,6 +235,51 @@ make_order_coro(std::vector<int>& log, std::mutex& log_mutex, int id)
     }(&log, &log_mutex, id);
 }
 
+struct lifetime_coro
+{
+    struct promise_type
+    {
+        lifetime_coro
+        get_return_object() noexcept
+        {
+            return lifetime_coro{
+                std::coroutine_handle<promise_type>::from_promise(*this)};
+        }
+        std::suspend_always initial_suspend() noexcept { return {}; }
+        std::suspend_never  final_suspend()   noexcept { return {}; }
+        void return_void() noexcept {}
+        void unhandled_exception() { std::terminate(); }
+    };
+
+    std::coroutine_handle<promise_type> h_;
+
+    ~lifetime_coro() { if(h_) h_.destroy(); }
+    lifetime_coro(lifetime_coro&& other) noexcept : h_(other.h_) { other.h_ = nullptr; }
+    lifetime_coro& operator=(lifetime_coro&& other) noexcept
+    {
+        if(h_) h_.destroy();
+        h_ = other.h_;
+        other.h_ = nullptr;
+        return *this;
+    }
+
+    std::coroutine_handle<void> handle() const noexcept { return h_; }
+    void release() noexcept { h_ = nullptr; }
+
+private:
+    explicit lifetime_coro(std::coroutine_handle<promise_type> h) : h_(h) {}
+    friend lifetime_coro make_lifetime_coro(std::atomic<bool>&);
+};
+
+inline lifetime_coro
+make_lifetime_coro(std::atomic<bool>& flag)
+{
+    return [](std::atomic<bool>* f) -> lifetime_coro {
+        f->store(true);
+        co_return;
+    }(&flag);
+}
+
 } // namespace
 
 struct strand_test
@@ -324,13 +377,249 @@ struct strand_test
         auto s1 = strand(pool.get_executor());
         auto s2 = s1;
 
-        // Copies are equal
+        // Copies share the same impl.
         BOOST_TEST(s1 == s2);
 
-        // Different strands from same pool may or may not be equal
-        // depending on internal hash collision
+        // Distinct strands have distinct impls.
         auto s3 = strand(pool.get_executor());
-        (void)s3;
+        BOOST_TEST(!(s1 == s3));
+    }
+
+    void
+    testNoEqualityCollisions()
+    {
+        thread_pool pool(1);
+        constexpr int N = 1000;
+
+        std::vector<strand<thread_pool::executor_type>> strands;
+        strands.reserve(N);
+        for(int i = 0; i < N; ++i)
+            strands.push_back(strand(pool.get_executor()));
+
+        int collisions = 0;
+        for(int i = 0; i < N; ++i)
+            for(int j = i + 1; j < N; ++j)
+                if(strands[i] == strands[j])
+                    ++collisions;
+
+        BOOST_TEST_EQ(collisions, 0);
+    }
+
+    void
+    testStrandsAreIndependent()
+    {
+        // Two threads so two strands can run concurrently. Construct
+        // enough strands that the first and last would have shared an
+        // impl under the previous 211-slot pooled design; verify the
+        // new per-strand design lets them run in parallel.
+        thread_pool pool(2);
+
+        constexpr int N = 212;  // > 211 forces a hash-pool collision pre-refactor
+        std::vector<strand<thread_pool::executor_type>> strands;
+        strands.reserve(N);
+        for(int i = 0; i < N; ++i)
+            strands.push_back(strand(pool.get_executor()));
+
+        auto& sA = strands.front();
+        auto& sB = strands.back();
+
+        std::atomic<bool> a_started{false};
+        std::atomic<bool> a_done{false};
+        std::atomic<bool> b_done{false};
+
+        struct latched_coro
+        {
+            struct promise_type
+            {
+                latched_coro
+                get_return_object() noexcept
+                {
+                    return latched_coro{
+                        std::coroutine_handle<promise_type>::from_promise(*this)};
+                }
+                std::suspend_always initial_suspend() noexcept { return {}; }
+                std::suspend_never  final_suspend()   noexcept { return {}; }
+                void return_void() noexcept {}
+                void unhandled_exception() { std::terminate(); }
+            };
+            std::coroutine_handle<promise_type> h_;
+        };
+
+        auto make_latched =
+            [](std::atomic<bool>* started,
+               std::atomic<bool>& done,
+               std::chrono::milliseconds delay) -> latched_coro
+        {
+            if(started) started->store(true);
+            std::this_thread::sleep_for(delay);
+            done.store(true);
+            co_return;
+        };
+
+        auto coro_a = make_latched(
+            &a_started, a_done, std::chrono::milliseconds(200));
+        continuation ca{coro_a.h_};
+        sA.post(ca);
+        coro_a.h_ = nullptr;
+
+        // Wait until A is actively sleeping
+        BOOST_TEST(wait_for([&]{ return a_started.load(); }));
+
+        auto coro_b = make_latched(
+            nullptr, b_done, std::chrono::milliseconds(0));
+        continuation cb{coro_b.h_};
+        sB.post(cb);
+        coro_b.h_ = nullptr;
+
+        // B should complete while A is still sleeping
+        BOOST_TEST(wait_for(
+            [&]{ return b_done.load(); },
+            std::chrono::milliseconds(150)));
+        BOOST_TEST(!a_done.load());
+
+        // Let A finish so the test cleans up
+        BOOST_TEST(wait_for([&]{ return a_done.load(); }));
+    }
+
+    void
+    testTransientStrandLifetime()
+    {
+        thread_pool pool(1);
+        std::atomic<bool> done{false};
+        std::weak_ptr<detail::strand_impl> impl_weak;
+
+        // c must outlive its time in the strand queue; the strand
+        // links it intrusively rather than copying.
+        continuation c;
+        {
+            auto s = strand(pool.get_executor());
+            impl_weak = s.impl_;
+            auto coro = make_lifetime_coro(done);
+            c.h = coro.handle();
+            s.post(c);
+            coro.release();
+        }   // strand handle dropped here
+
+        BOOST_TEST(wait_for([&]{ return done.load(); }));
+        // After the invoker drains and exits, the impl shared_ptr in
+        // its coroutine frame releases. The weak_ptr should expire.
+        BOOST_TEST(wait_for([&]{ return impl_weak.expired(); }));
+    }
+
+    void
+    testManyStrandsStress()
+    {
+        thread_pool pool(4);
+        constexpr int num_strands = 10000;
+        constexpr int posts_per_strand = 3;
+
+        std::atomic<int> total{0};
+
+        std::vector<strand<thread_pool::executor_type>> strands;
+        strands.reserve(num_strands);
+        for(int i = 0; i < num_strands; ++i)
+            strands.push_back(strand(pool.get_executor()));
+
+        std::vector<counter_coro> coros;
+        coros.reserve(num_strands * posts_per_strand);
+        std::vector<continuation> conts;
+        conts.reserve(num_strands * posts_per_strand);
+
+        for(int i = 0; i < num_strands; ++i)
+        {
+            for(int j = 0; j < posts_per_strand; ++j)
+            {
+                coros.push_back(make_counter_coro(total));
+                conts.push_back({coros.back().handle()});
+                strands[i].post(conts.back());
+                coros.back().release();
+            }
+        }
+
+        BOOST_TEST(wait_for(
+            [&]{ return total.load() >= num_strands * posts_per_strand; },
+            std::chrono::milliseconds(30000)));
+        BOOST_TEST_EQ(total.load(), num_strands * posts_per_strand);
+    }
+
+    void
+    testMutexPoolCollisionIsolation()
+    {
+        // 193 mutexes in the service pool. With > 193 strands, at least
+        // two must share a mutex. Scan to find a colliding pair, then
+        // verify they run concurrently when posted to in parallel.
+        thread_pool pool(2);
+
+        constexpr int N = 200;
+        std::vector<strand<thread_pool::executor_type>> strands;
+        strands.reserve(N);
+        for(int i = 0; i < N; ++i)
+            strands.push_back(strand(pool.get_executor()));
+
+        // Find a colliding pair via the borrowed mutex pointer.
+        int idx_a = -1, idx_b = -1;
+        for(int i = 0; i < N && idx_b < 0; ++i)
+        {
+            for(int j = i + 1; j < N; ++j)
+            {
+                if(strands[i].impl_->mutex_ == strands[j].impl_->mutex_)
+                {
+                    idx_a = i;
+                    idx_b = j;
+                    break;
+                }
+            }
+        }
+        BOOST_TEST(idx_a >= 0);    // pigeonhole guarantees a hit
+        if(idx_a < 0)
+            return;
+
+        auto& sA = strands[idx_a];
+        auto& sB = strands[idx_b];
+
+        std::atomic<int> max_active{0};
+        std::atomic<int> active{0};
+        std::atomic<int> done{0};
+
+        // Each coroutine increments active, then waits at a rendezvous
+        // until both have arrived (or timeout). If colliding strands run
+        // in parallel, both observe active==2; if they serialize, the
+        // first waits the full timeout and max_active never reaches 2.
+        auto make_busy = [&]() -> counter_coro {
+            return [](std::atomic<int>* a,
+                      std::atomic<int>* m,
+                      std::atomic<int>* d) -> counter_coro
+            {
+                int cur = ++(*a);
+                int prev = m->load();
+                while(cur > prev && !m->compare_exchange_weak(prev, cur)) {}
+                auto deadline = std::chrono::steady_clock::now() +
+                    std::chrono::seconds(2);
+                while(a->load() < 2 &&
+                      std::chrono::steady_clock::now() < deadline)
+                    std::this_thread::sleep_for(std::chrono::milliseconds(1));
+                int cur2 = a->load();
+                int prev2 = m->load();
+                while(cur2 > prev2 && !m->compare_exchange_weak(prev2, cur2)) {}
+                --(*a);
+                ++(*d);
+                co_return;
+            }(&active, &max_active, &done);
+        };
+
+        auto coroA = make_busy();
+        auto coroB = make_busy();
+        continuation cA{coroA.handle()};
+        continuation cB{coroB.handle()};
+        sA.post(cA);
+        sB.post(cB);
+        coroA.release();
+        coroB.release();
+
+        BOOST_TEST(wait_for(
+            [&]{ return done.load() >= 2; },
+            std::chrono::seconds(10)));
+        BOOST_TEST_EQ(max_active.load(), 2);
     }
 
     void
@@ -352,7 +641,8 @@ struct strand_test
         std::atomic<int> counter{0};
 
         auto coro = make_counter_coro(counter);
-        s.post(coro.handle());
+        continuation c{coro.handle()};
+        s.post(c);
         coro.release();
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= 1; }));
@@ -368,7 +658,8 @@ struct strand_test
         std::atomic<int> counter{0};
 
         auto coro = make_counter_coro(counter);
-        s.dispatch(coro.handle());
+        continuation c{coro.handle()};
+        s.dispatch(c);
         coro.release();
 
         BOOST_TEST(wait_for([&]{ return counter.load() >= 1; }));
@@ -386,11 +677,14 @@ struct strand_test
 
         std::vector<counter_coro> coros;
         coros.reserve(N);
+        std::vector<continuation> conts;
+        conts.reserve(N);
 
         for(int i = 0; i < N; ++i)
         {
             coros.push_back(make_counter_coro(counter));
-            s.post(coros.back().handle());
+            conts.push_back({coros.back().handle()});
+            s.post(conts.back());
             coros.back().release();
         }
 
@@ -411,16 +705,31 @@ struct strand_test
         std::vector<std::thread> threads;
         threads.reserve(num_threads);
 
+        // Storage hoisted out of the threads so each continuation
+        // outlives its time in the strand queue.
+        std::vector<std::vector<counter_coro>> coros_per_thread(num_threads);
+        std::vector<std::vector<continuation>> conts_per_thread(num_threads);
+        for(int i = 0; i < num_threads; ++i)
+        {
+            coros_per_thread[i].reserve(per_thread);
+            conts_per_thread[i].reserve(per_thread);
+        }
+
         for(int i = 0; i < num_threads; ++i)
         {
-            threads.emplace_back([&s, &counter]{
-                for(int j = 0; j < per_thread; ++j)
+            threads.emplace_back(
+                [&s, &counter,
+                 &my_coros = coros_per_thread[i],
+                 &my_conts = conts_per_thread[i]]
                 {
-                    auto coro = make_counter_coro(counter);
-                    s.post(coro.handle());
-                    coro.release();
-                }
-            });
+                    for(int j = 0; j < per_thread; ++j)
+                    {
+                        my_coros.push_back(make_counter_coro(counter));
+                        my_conts.push_back({my_coros.back().handle()});
+                        s.post(my_conts.back());
+                        my_coros.back().release();
+                    }
+                });
         }
 
         for(auto& t : threads)
@@ -443,12 +752,15 @@ struct strand_test
 
         std::vector<order_coro> coros;
         coros.reserve(N);
+        std::vector<continuation> conts;
+        conts.reserve(N);
 
         // Post coroutines with sequential IDs
         for(int i = 0; i < N; ++i)
         {
             coros.push_back(make_order_coro(log, log_mutex, i));
-            s.post(coros.back().handle());
+            conts.push_back({coros.back().handle()});
+            s.post(conts.back());
             coros.back().release();
         }
 
@@ -545,11 +857,14 @@ struct strand_test
 
         std::vector<tracking_coro> coros;
         coros.reserve(N);
+        std::vector<continuation> conts;
+        conts.reserve(N);
 
         for(int i = 0; i < N; ++i)
         {
             coros.push_back(make_tracking_coro());
-            s.post(coros.back().handle());
+            conts.push_back({coros.back().handle()});
+            s.post(conts.back());
             coros.back().release();
         }
 
@@ -560,6 +875,30 @@ struct strand_test
         BOOST_TEST_EQ(completed.load(), N);
     }
 
+    // After co_await run(strand)(...) returns, caller must be outside
+    // the strand. User-reported bug: today it is still inside.
+    void
+    testExitStrandAfterRun()
+    {
+        bool running_in_strand_after_run = true;
+        bool inner_ran = false;
+
+        test::run_blocking()([&]() -> task<void> {
+            auto ex = co_await this_coro::executor;
+            auto str = capy::strand(ex);
+
+            co_await capy::run(str)([&]() -> task<void> {
+                inner_ran = true;
+                co_return;
+            }());
+
+            running_in_strand_after_run = str.running_in_this_thread();
+        }());
+
+        BOOST_TEST(inner_ran);
+        BOOST_TEST(!running_in_strand_after_run);
+    }
+
     void
     testAnyExecutor()
     {
@@ -591,11 +930,14 @@ struct strand_test
 
             std::vector<counter_coro> coros;
             coros.reserve(N);
+            std::vector<continuation> conts;
+            conts.reserve(N);
 
             for(int i = 0; i < N; ++i)
             {
                 coros.push_back(make_counter_coro(counter));
-                s.post(coros.back().handle());
+                conts.push_back({coros.back().handle()});
+                s.post(conts.back());
                 coros.back().release();
             }
 
@@ -623,6 +965,11 @@ struct strand_test
         testContext();
         testWorkTracking();
         testEquality();
+        testNoEqualityCollisions();
+        testStrandsAreIndependent();
+        testTransientStrandLifetime();
+        testManyStrandsStress();
+        testMutexPoolCollisionIsolation();
         testRunningInThisThread();
         testPost();
         testDispatch();
@@ -630,6 +977,7 @@ struct strand_test
         testConcurrentPost();
         testFifoOrder();
         testSerialization();
+        testExitStrandAfterRun();
         testAnyExecutor();
     }
 };
diff --git a/test/unit/ex/thread_pool.cpp b/test/unit/ex/thread_pool.cpp
index 49e33860a..25d561f1b 100644
--- a/test/unit/ex/thread_pool.cpp
+++ b/test/unit/ex/thread_pool.cpp
@@ -13,10 +13,14 @@
 
 #include <boost/capy/concept/execution_context.hpp>
 #include <boost/capy/concept/executor.hpp>
+#include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/work_guard.hpp>
 
 #include "test_helpers.hpp"
 
+#include <array>
 #include <atomic>
+#include <thread>
 #include <vector>
 
 namespace boost {
@@ -45,6 +49,46 @@ struct test_service : execution_context::service
     void shutdown() override {}
 };
 
+// Probe coroutine starts suspended; resuming it completes and
+// auto-destroys the frame (suspend_never final). If never
+// resumed, probe_coro's dtor destroys it.
+struct probe_coro
+{
+    struct promise_type
+    {
+        probe_coro
+        get_return_object() noexcept
+        {
+            return probe_coro{
+                std::coroutine_handle<promise_type>::from_promise(*this)};
+        }
+        std::suspend_always initial_suspend() noexcept { return {}; }
+        std::suspend_never final_suspend() noexcept { return {}; }
+        void return_void() noexcept {}
+        void unhandled_exception() { std::terminate(); }
+    };
+
+    std::coroutine_handle<promise_type> h_;
+
+    ~probe_coro() { if(h_) h_.destroy(); }
+
+    probe_coro(probe_coro&& other) noexcept
+        : h_(other.h_) { other.h_ = nullptr; }
+
+    std::coroutine_handle<void> handle() const noexcept { return h_; }
+    void release() noexcept { h_ = nullptr; }
+
+private:
+    explicit probe_coro(std::coroutine_handle<promise_type> h)
+        : h_(h) {}
+};
+
+inline probe_coro
+make_probe()
+{
+    co_return;
+}
+
 #if defined(BOOST_CAPY_TEST_CAN_GET_THREAD_NAME)
 // Result storage for thread name check
 struct name_check_result
@@ -157,11 +201,13 @@ struct thread_pool_test
     void
     testPostWork()
     {
+        // continuation must outlive pool (LIFO destruction order)
+        continuation c{std::noop_coroutine()};
         thread_pool pool(1);
         auto ex = pool.get_executor();
 
         // Post a noop coroutine and verify no exceptions
-        ex.post(std::noop_coroutine());
+        ex.post(c);
 
         // Basic test: pool constructs and destructs without issue
         (void)ex;
@@ -183,11 +229,95 @@ struct thread_pool_test
     void
     testDispatch()
     {
-        thread_pool pool(1);
-        auto ex = pool.get_executor();
+        // From outside any pool, dispatch() posts.
+        auto probe = make_probe();
+        auto probe_h = probe.handle();
+        auto* target = new continuation{probe_h};
 
-        // dispatch() always posts for thread_pool (returns void)
-        ex.dispatch(std::noop_coroutine());
+        std::coroutine_handle<> returned;
+        {
+            thread_pool pool(1);
+            auto ex = pool.get_executor();
+            returned = ex.dispatch(*target);
+        }
+
+        BOOST_TEST(returned != probe_h);
+        if(returned != probe_h)
+            probe.release();
+        delete target;
+    }
+
+    void
+    testDispatchSymmetricTransfer()
+    {
+        // From a worker thread of the same pool, dispatch()
+        // returns c.h for symmetric transfer and does not
+        // enqueue the continuation.
+        auto probe = make_probe();
+        auto probe_h = probe.handle();
+
+        // Heap-allocated so target outlives the pool if a buggy
+        // implementation erroneously posts it.
+        auto* target = new continuation{probe_h};
+
+        std::atomic<bool> done{false};
+        std::coroutine_handle<> returned;
+
+        {
+            thread_pool pool(1);
+            auto ex = pool.get_executor();
+
+            run_async(ex, [&]{
+                returned = ex.dispatch(*target);
+                done.store(true);
+            })(void_task());
+
+            BOOST_TEST(wait_for([&]{ return done.load(); }));
+        }
+
+        // On symmetric transfer the returned handle equals the
+        // target's handle and the probe is never enqueued.
+        BOOST_TEST(returned == probe_h);
+
+        // If the dispatch posted (buggy), the pool destructor's
+        // drain_abandoned already destroyed probe_h; release so
+        // the probe_coro dtor does not double-destroy.
+        if(returned != probe_h)
+            probe.release();
+        delete target;
+    }
+
+    void
+    testDispatchCrossPool()
+    {
+        // Worker threads of pool A are not workers of pool B:
+        // dispatch() on B from an A worker must post, not
+        // symmetric-transfer.
+        auto probe = make_probe();
+        auto probe_h = probe.handle();
+        auto* target = new continuation{probe_h};
+
+        std::atomic<bool> done{false};
+        std::coroutine_handle<> returned;
+
+        {
+            thread_pool pool_a(1);
+            thread_pool pool_b(1);
+            auto ex_a = pool_a.get_executor();
+            auto ex_b = pool_b.get_executor();
+
+            run_async(ex_a, [&]{
+                returned = ex_b.dispatch(*target);
+                done.store(true);
+            })(void_task());
+
+            BOOST_TEST(wait_for([&]{ return done.load(); }));
+        }
+
+        BOOST_TEST(returned != probe_h);
+        if(returned != probe_h)
+            probe.release();
+        delete target;
     }
 
     void
@@ -230,10 +360,18 @@ struct thread_pool_test
     void
     testConcurrentPost()
     {
+        // Pre-allocate continuations: must outlive the pool
+        // (LIFO destruction order).
+        constexpr int num_threads = 8;
+        constexpr int posts_per_thread = 10;
+        std::vector<std::array<continuation, posts_per_thread>> all_conts(num_threads);
+        for(auto& arr : all_conts)
+            for(auto& c : arr)
+                c.h = std::noop_coroutine();
+
         thread_pool pool(4);
         auto ex = pool.get_executor();
 
-        constexpr int num_threads = 8;
         std::atomic<int> post_count{0};
 
         std::vector<std::thread> threads;
@@ -241,11 +379,11 @@ struct thread_pool_test
 
         for(int i = 0; i < num_threads; ++i)
         {
-            threads.emplace_back([&ex, &post_count]{
+            threads.emplace_back([&ex, &post_count, conts = all_conts[i].data()]{
                 // Multiple threads posting concurrently
-                for(int j = 0; j < 10; ++j)
+                for(int j = 0; j < posts_per_thread; ++j)
                 {
-                    ex.post(std::noop_coroutine());
+                    ex.post(conts[j]);
                     ++post_count;
                 }
             });
@@ -287,9 +425,11 @@ struct thread_pool_test
 #if defined(BOOST_CAPY_TEST_CAN_GET_THREAD_NAME)
         // Verify default thread name from within pool thread
         {
-            thread_pool pool(1);
             name_check_result result;
-            pool.get_executor().post(check_thread_name(result, "capy-pool-"));
+            auto nc = check_thread_name(result, "capy-pool-");
+            continuation c{nc.h};
+            thread_pool pool(1);
+            pool.get_executor().post(c);
 
             BOOST_TEST(wait_for([&]{ return result.done.load(); }));
             BOOST_TEST(result.matches.load());
@@ -297,9 +437,11 @@ struct thread_pool_test
 
         // Verify custom thread name from within pool thread
         {
-            thread_pool pool(1, "mypool-");
             name_check_result result;
-            pool.get_executor().post(check_thread_name(result, "mypool-"));
+            auto nc = check_thread_name(result, "mypool-");
+            continuation c{nc.h};
+            thread_pool pool(1, "mypool-");
+            pool.get_executor().post(c);
 
             BOOST_TEST(wait_for([&]{ return result.done.load(); }));
             BOOST_TEST(result.matches.load());
@@ -307,9 +449,11 @@ struct thread_pool_test
 
         // Verify thread naming works with index suffix
         {
-            thread_pool pool(1, "idx-");
             name_check_result result;
-            pool.get_executor().post(check_thread_name(result, "idx-0"));
+            auto nc = check_thread_name(result, "idx-0");
+            continuation c{nc.h};
+            thread_pool pool(1, "idx-");
+            pool.get_executor().post(c);
 
             BOOST_TEST(wait_for([&]{ return result.done.load(); }));
             BOOST_TEST(result.matches.load());
@@ -317,6 +461,242 @@ struct thread_pool_test
 #endif
     }
 
+    void
+    testJoinDrainsWork()
+    {
+        thread_pool pool(2);
+        auto ex = pool.get_executor();
+        std::atomic<int> count{0};
+
+        constexpr int N = 50;
+        for(int i = 0; i < N; ++i)
+        {
+            run_async(ex,
+                [&]{ count.fetch_add(1); }
+            )(void_task());
+        }
+
+        pool.join();
+        BOOST_TEST_EQ(count.load(), N);
+    }
+
+    void
+    testJoinNoWork()
+    {
+        // join() on a pool with no posted work returns promptly
+        thread_pool pool(2);
+        pool.join();
+    }
+
+    void
+    testJoinNoThreadsStarted()
+    {
+        // join() without ever posting (lazy start never triggered)
+        thread_pool pool(2);
+        // Don't call get_executor() or post anything
+        pool.join();
+    }
+
+    void
+    testJoinIdempotent()
+    {
+        thread_pool pool(1);
+        pool.join();
+        pool.join();  // second call should be a no-op
+    }
+
+    void
+    testStopThenJoin()
+    {
+        thread_pool pool(2);
+        pool.stop();
+        pool.join();  // should return immediately
+    }
+
+    void
+    testStopInterruptsJoin()
+    {
+        thread_pool pool(2);
+        auto ex = pool.get_executor();
+
+        // Hold work guard to keep join() blocking
+        auto guard = make_work_guard(ex);
+
+        std::atomic<bool> join_returned{false};
+        std::thread joiner([&]{
+            pool.join();
+            join_returned.store(true);
+        });
+
+        // Give join() time to block
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(50));
+        BOOST_TEST(!join_returned.load());
+
+        // stop() should interrupt the blocking join()
+        pool.stop();
+
+        joiner.join();
+        BOOST_TEST(join_returned.load());
+    }
+
+    void
+    testDestructorAbandonsPending()
+    {
+        // Verify the destructor doesn't hang when work items
+        // are genuinely queued but unprocessed. We block the
+        // single worker thread with a spinning callback, then
+        // post items that pile up in the queue. After releasing
+        // the worker, the destructor's stop() causes it to exit
+        // without draining the queue.
+        {
+            std::atomic<bool> busy{false};
+            std::atomic<bool> release{false};
+            std::array<continuation, 50> conts;
+
+            thread_pool pool(1);
+            auto ex = pool.get_executor();
+
+            // Block the worker via run_async callback
+            run_async(ex, [&]{
+                busy.store(true);
+                while(!release.load())
+                    std::this_thread::yield();
+            })(void_task());
+
+            // Wait until worker is executing our callback
+            while(!busy.load())
+                std::this_thread::yield();
+
+            // Queue items that can't be processed yet
+            for(int i = 0; i < 50; ++i)
+            {
+                conts[i].h = std::noop_coroutine();
+                ex.post(conts[i]);
+            }
+
+            // Release worker, then pool destructs immediately.
+            // stop() races with the worker — pending items
+            // are abandoned and destroyed by ~impl().
+            release.store(true);
+        }
+    }
+
+    void
+    testStopCallbackPostBack()
+    {
+        // Cancel a suspended task via stop_token, then let the
+        // pool destruct. stop_only_awaitable uses resume_via_post
+        // so the coroutine resumes on a pool thread, not on the
+        // thread that calls request_stop().
+        {
+            thread_pool pool(1);
+            auto ex = pool.get_executor();
+            std::stop_source ss;
+
+            auto make_task = []() -> task<void> {
+                co_await stop_only_awaitable{};
+            };
+
+            run_async(ex, ss.get_token())(make_task());
+
+            std::this_thread::sleep_for(
+                std::chrono::milliseconds(50));
+
+            ss.request_stop();
+        }
+    }
+
+    void
+    testStopCallbackWithJoin()
+    {
+        // Cancel a suspended task, then join() the pool.
+        // Verifies work counting and join() interact correctly
+        // with stop_callback cancellation.
+        {
+            thread_pool pool(1);
+            auto ex = pool.get_executor();
+            std::stop_source ss;
+
+            auto make_task = []() -> task<void> {
+                co_await stop_only_awaitable{};
+            };
+
+            run_async(ex, ss.get_token())(make_task());
+
+            std::this_thread::sleep_for(
+                std::chrono::milliseconds(50));
+
+            ss.request_stop();
+            pool.join();
+        }
+    }
+
+    void
+    testStopCallbackRepeated()
+    {
+        // Stress test: repeated cancel + pool destruction cycles.
+        for(int iter = 0; iter < 50; ++iter)
+        {
+            thread_pool pool(2);
+            auto ex = pool.get_executor();
+            std::stop_source ss;
+
+            auto make_task = []() -> task<void> {
+                co_await stop_only_awaitable{};
+            };
+
+            for(int i = 0; i < 5; ++i)
+                run_async(ex, ss.get_token())(make_task());
+
+            std::this_thread::sleep_for(
+                std::chrono::milliseconds(10));
+
+            ss.request_stop();
+        }
+    }
+
+    void
+    testWorkGuardKeepsPoolAlive()
+    {
+        thread_pool pool(1);
+        auto ex = pool.get_executor();
+        std::atomic<bool> join_returned{false};
+
+        auto guard = make_work_guard(ex);
+
+        std::thread joiner([&]{
+            pool.join();
+            join_returned.store(true);
+        });
+
+        // Give join() time to block
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(50));
+        BOOST_TEST(!join_returned.load());
+
+        // Releasing the guard should allow join() to complete
+        guard.reset();
+
+        joiner.join();
+        BOOST_TEST(join_returned.load());
+    }
+
+    void
+    testJoinWithRunAsync()
+    {
+        thread_pool pool(2);
+        auto ex = pool.get_executor();
+        std::atomic<int> result{0};
+
+        run_async(ex,
+            [&](int v){ result.store(v); }
+        )(returns_int(42));
+
+        pool.join();
+        BOOST_TEST_EQ(result.load(), 42);
+    }
+
     void
     run()
     {
@@ -327,11 +707,25 @@ struct thread_pool_test
         testPostWork();
         testWorkCounting();
         testDispatch();
+        testDispatchSymmetricTransfer();
+        testDispatchCrossPool();
         testServiceManagement();
         testMakeService();
         testConcurrentPost();
         testDefaultExecutor();
         testThreadNaming();
+        testJoinDrainsWork();
+        testJoinNoWork();
+        testJoinNoThreadsStarted();
+        testJoinIdempotent();
+        testStopThenJoin();
+        testStopInterruptsJoin();
+        testDestructorAbandonsPending();
+        testStopCallbackPostBack();
+        testStopCallbackWithJoin();
+        testStopCallbackRepeated();
+        testWorkGuardKeepsPoolAlive();
+        testJoinWithRunAsync();
     }
 };
 
diff --git a/test/unit/ex/work_guard.cpp b/test/unit/ex/work_guard.cpp
index a5f04901f..fe0ba778f 100644
--- a/test/unit/ex/work_guard.cpp
+++ b/test/unit/ex/work_guard.cpp
@@ -64,13 +64,13 @@ struct guard_test_executor
     }
 
     std::coroutine_handle<>
-    dispatch(std::coroutine_handle<> h) const
+    dispatch(continuation& c) const
     {
-        return h;
+        return c.h;
     }
 
     void
-    post(std::coroutine_handle<>) const
+    post(continuation&) const
     {
     }
 };
diff --git a/test/unit/io/write_now.cpp b/test/unit/io/write_now.cpp
index a2285a5ff..eadb02385 100644
--- a/test/unit/io/write_now.cpp
+++ b/test/unit/io/write_now.cpp
@@ -10,7 +10,6 @@
 // Test that header file is self-contained.
 #include <boost/capy/io/write_now.hpp>
 
-#include <boost/capy/buffers/buffer_pair.hpp>
 #include <boost/capy/buffers/make_buffer.hpp>
 #include <boost/capy/error.hpp>
 #include <boost/capy/test/fuse.hpp>
@@ -148,7 +147,7 @@ class write_now_test
 
             std::string s1("ab");
             std::string s2("cdefgh");
-            const_buffer_pair bp{{
+            std::array<const_buffer, 2> bp{{
                 const_buffer(s1.data(), s1.size()),
                 const_buffer(s2.data(), s2.size())
             }};
diff --git a/test/unit/io_result.cpp b/test/unit/io_result.cpp
index f18a66eee..9c01edc07 100644
--- a/test/unit/io_result.cpp
+++ b/test/unit/io_result.cpp
@@ -41,18 +41,18 @@ struct io_result_test
         // Default construction
         io_result<std::size_t> r1;
         BOOST_TEST(!r1.ec);
-        BOOST_TEST_EQ(r1.t1, 0u);
+        BOOST_TEST_EQ(std::get<0>(r1.values), 0u);
 
         // With values
         io_result<std::size_t> r2{{}, 42};
         BOOST_TEST(!r2.ec);
-        BOOST_TEST_EQ(r2.t1, 42u);
+        BOOST_TEST_EQ(std::get<0>(r2.values), 42u);
 
         // With error
         io_result<std::size_t> r3{
             make_error_code(std::errc::invalid_argument), 10};
         BOOST_TEST(r3.ec);
-        BOOST_TEST_EQ(r3.t1, 10u);
+        BOOST_TEST_EQ(std::get<0>(r3.values), 10u);
 
         // Structured binding
         auto [ec, n] = r2;
@@ -66,7 +66,7 @@ struct io_result_test
         // With string value
         io_result<std::string> r1{{}, "hello"};
         BOOST_TEST(!r1.ec);
-        BOOST_TEST_EQ(r1.t1, "hello");
+        BOOST_TEST_EQ(std::get<0>(r1.values), "hello");
 
         // Structured binding
         auto [ec, v] = r1;
@@ -77,7 +77,7 @@ struct io_result_test
         io_result<std::string> r2{
             make_error_code(std::errc::invalid_argument), "error"};
         BOOST_TEST(r2.ec);
-        BOOST_TEST_EQ(r2.t1, "error");
+        BOOST_TEST_EQ(std::get<0>(r2.values), "error");
     }
 
     void
@@ -87,9 +87,9 @@ struct io_result_test
         io_result<int, double, std::string> r1{
             {}, 42, 3.14, std::string("test")};
         BOOST_TEST(!r1.ec);
-        BOOST_TEST_EQ(r1.t1, 42);
-        BOOST_TEST_EQ(r1.t2, 3.14);
-        BOOST_TEST_EQ(r1.t3, "test");
+        BOOST_TEST_EQ(std::get<0>(r1.values), 42);
+        BOOST_TEST_EQ(std::get<1>(r1.values), 3.14);
+        BOOST_TEST_EQ(std::get<2>(r1.values), "test");
 
         // Structured binding
         auto [ec, a, b, c] = r1;
@@ -102,8 +102,35 @@ struct io_result_test
         io_result<int, double> r2{
             make_error_code(std::errc::invalid_argument), 0, 0.0};
         BOOST_TEST(r2.ec);
-        BOOST_TEST_EQ(r2.t1, 0);
-        BOOST_TEST_EQ(r2.t2, 0.0);
+        BOOST_TEST_EQ(std::get<0>(r2.values), 0);
+        BOOST_TEST_EQ(std::get<1>(r2.values), 0.0);
+    }
+
+    void
+    testFourPlusArgs()
+    {
+        // Verify no arity limit
+        io_result<int, double, std::string, bool> r1{
+            {}, 1, 2.5, std::string("hi"), true};
+        BOOST_TEST(!r1.ec);
+        BOOST_TEST_EQ(std::get<0>(r1.values), 1);
+        BOOST_TEST_EQ(std::get<1>(r1.values), 2.5);
+        BOOST_TEST_EQ(std::get<2>(r1.values), "hi");
+        BOOST_TEST_EQ(std::get<3>(r1.values), true);
+
+        // Structured binding
+        auto [ec, a, b, c, d] = r1;
+        BOOST_TEST(!ec);
+        BOOST_TEST_EQ(a, 1);
+        BOOST_TEST_EQ(b, 2.5);
+        BOOST_TEST_EQ(c, "hi");
+        BOOST_TEST_EQ(d, true);
+
+        // Default construction
+        io_result<int, double, std::string, bool> r2;
+        BOOST_TEST(!r2.ec);
+        BOOST_TEST_EQ(std::get<0>(r2.values), 0);
+        BOOST_TEST_EQ(std::get<3>(r2.values), false);
     }
 
     void
@@ -113,6 +140,7 @@ struct io_result_test
         testSizeResult();
         testGenericSingleValue();
         testMultiValue();
+        testFourPlusArgs();
     }
 };
 
diff --git a/test/unit/quitter.cpp b/test/unit/quitter.cpp
new file mode 100644
index 000000000..52fe86528
--- /dev/null
+++ b/test/unit/quitter.cpp
@@ -0,0 +1,819 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// Test that header file is self-contained.
+#include <boost/capy/quitter.hpp>
+
+#include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/delay.hpp>
+#include <boost/capy/error.hpp>
+#include <boost/capy/ex/io_env.hpp>
+#include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/this_coro.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/io_task.hpp>
+#include <boost/capy/task.hpp>
+#include <boost/capy/test/run_blocking.hpp>
+#include <boost/capy/test/stream.hpp>
+#include <boost/capy/when_all.hpp>
+#include <boost/capy/when_any.hpp>
+
+#include "test_helpers.hpp"
+
+#include <atomic>
+#include <latch>
+#include <semaphore>
+#include <string>
+#include <string_view>
+#include <system_error>
+#include <thread>
+#include <variant>
+
+namespace boost {
+namespace capy {
+
+static_assert(IoAwaitable<quitter<void>>);
+static_assert(IoAwaitable<quitter<int>>);
+static_assert(IoRunnable<quitter<void>>);
+static_assert(IoRunnable<quitter<int>>);
+
+struct quitter_test
+{
+    //----------------------------------------------------------
+    // 1. Normal completion — quitter<int> returns a value
+    //----------------------------------------------------------
+
+    static quitter<int>
+    returns_int()
+    {
+        co_return 42;
+    }
+
+    void
+    testNormalCompletion()
+    {
+        int result = 0;
+        test::run_blocking([&](int v) { result = v; })(returns_int());
+        BOOST_TEST_EQ(result, 42);
+    }
+
+    //----------------------------------------------------------
+    // 2. Void completion
+    //----------------------------------------------------------
+
+    static quitter<>
+    void_quitter()
+    {
+        co_return;
+    }
+
+    void
+    testVoidCompletion()
+    {
+        test::run_blocking()(void_quitter());
+    }
+
+    //----------------------------------------------------------
+    // 3. Exception propagation
+    //----------------------------------------------------------
+
+    static quitter<>
+    throws_quitter()
+    {
+        throw test_exception("quitter exception");
+        co_return;
+    }
+
+    void
+    testExceptionPropagation()
+    {
+        BOOST_TEST_THROWS(
+            test::run_blocking()(throws_quitter()),
+            test_exception);
+    }
+
+    //----------------------------------------------------------
+    // 4. Stop before first co_await
+    //----------------------------------------------------------
+
+    struct raii_counter
+    {
+        int* count;
+        raii_counter(int& c) : count(&c) {}
+        raii_counter(raii_counter&& o) noexcept
+            : count(std::exchange(o.count, nullptr)) {}
+        ~raii_counter() { if(count) ++(*count); }
+    };
+
+    static quitter<>
+    quitter_with_raii(int& dtor_count)
+    {
+        raii_counter guard(dtor_count);
+        co_await stop_only_awaitable{};
+    }
+
+    void
+    testStopBeforeFirstAwait()
+    {
+        // When stop is already requested, initial_suspend throws
+        // before the coroutine body starts.  Body locals are never
+        // constructed, so dtor_count stays 0.  The stopped state
+        // routes to the error handler via exception().
+        int dispatch_count = 0;
+        test_executor ex(dispatch_count);
+        std::stop_source source;
+        source.request_stop();
+
+        int dtor_count = 0;
+        bool got_stopped = false;
+
+        run_async(ex, source.get_token(),
+            [](){ BOOST_TEST(false); },
+            [&](std::exception_ptr ep) {
+                got_stopped = (ep != nullptr);
+            })(quitter_with_raii(dtor_count));
+
+        BOOST_TEST(got_stopped);
+        // Body never started, so no destructors ran
+        BOOST_TEST_EQ(dtor_count, 0);
+    }
+
+    //----------------------------------------------------------
+    // 5. Stop during I/O
+    //----------------------------------------------------------
+
+    void
+    testStopDuringIO()
+    {
+        std::atomic<int> state = 0;
+        std::binary_semaphore suspended{0};
+
+        int dtor_count = 0;
+
+        auto q = [&]() -> quitter<>
+        {
+            raii_counter guard(dtor_count);
+            state = 1;
+            suspended.release();
+            co_await stop_only_awaitable{};
+            // Should never reach here when stopped
+            state = 99;
+        };
+
+        {
+            std::jthread jt(
+                [&](std::stop_token st)
+                {
+                    int dc = 0;
+                    test_executor ex(dc);
+                    run_async(ex, st)(q());
+                }
+            );
+            suspended.acquire();
+            BOOST_TEST(state == 1);
+            // jthread destructor calls request_stop() then join()
+        }
+
+        // The coroutine was stopped; it should NOT have reached state=99
+        BOOST_TEST(state != 99);
+        BOOST_TEST_EQ(dtor_count, 1);
+    }
+
+    //----------------------------------------------------------
+    // 6. Stop propagation through chain
+    //----------------------------------------------------------
+
+    static quitter<>
+    inner_quitter(
+        int& dtor_count,
+        bool& reached_end,
+        std::binary_semaphore& suspended)
+    {
+        raii_counter guard(dtor_count);
+        suspended.release();
+        co_await stop_only_awaitable{};
+        reached_end = true;
+    }
+
+    static quitter<>
+    middle_quitter(
+        int& dtor_count,
+        bool& reached_end,
+        std::binary_semaphore& suspended)
+    {
+        raii_counter guard(dtor_count);
+        co_await inner_quitter(dtor_count, reached_end, suspended);
+        reached_end = true;
+    }
+
+    static quitter<>
+    outer_quitter(
+        int& dtor_count,
+        bool& reached_end,
+        std::binary_semaphore& suspended)
+    {
+        raii_counter guard(dtor_count);
+        co_await middle_quitter(dtor_count, reached_end, suspended);
+        reached_end = true;
+    }
+
+    void
+    testStopPropagationChain()
+    {
+        std::binary_semaphore suspended{0};
+        int dtor_count = 0;
+        bool reached_end = false;
+
+        auto top = [&]() -> quitter<>
+        {
+            co_await outer_quitter(
+                dtor_count, reached_end, suspended);
+            reached_end = true;
+        };
+
+        {
+            std::jthread jt(
+                [&](std::stop_token st)
+                {
+                    int dc = 0;
+                    test_executor ex(dc);
+                    run_async(ex, st)(top());
+                }
+            );
+            suspended.acquire();
+        }
+
+        // 3 guards from outer/middle/inner
+        BOOST_TEST_EQ(dtor_count, 3);
+        // No coroutine body continued past its co_await
+        BOOST_TEST(!reached_end);
+    }
+
+    //----------------------------------------------------------
+    // 9. Mixing quitter and task — task awaits quitter
+    //----------------------------------------------------------
+
+    static quitter<int>
+    quitter_returns_42()
+    {
+        co_return 42;
+    }
+
+    static task<int>
+    task_awaits_quitter()
+    {
+        int v = co_await quitter_returns_42();
+        co_return v + 1;
+    }
+
+    void
+    testMixingQuitterAndTask()
+    {
+        // Normal case: task awaits quitter that completes normally
+        {
+            int result = 0;
+            test::run_blocking(
+                [&](int v) { result = v; })(task_awaits_quitter());
+            BOOST_TEST_EQ(result, 43);
+        }
+
+        // Stopped case: task awaits stopped quitter — sees exception
+        {
+            std::stop_source source;
+            source.request_stop();
+
+            int dispatch_count = 0;
+            test_executor ex(dispatch_count);
+            bool got_exception = false;
+
+            auto t = []() -> task<int>
+            {
+                co_return co_await quitter_returns_42();
+            };
+
+            run_async(ex, source.get_token(),
+                [](int) { BOOST_TEST(false); },
+                [&](std::exception_ptr ep) {
+                    got_exception = (ep != nullptr);
+                })(t());
+
+            BOOST_TEST(got_exception);
+        }
+    }
+
+    //----------------------------------------------------------
+    // 10. No stop requested — identical to task<T>
+    //----------------------------------------------------------
+
+    static quitter<int>
+    quitter_chain()
+    {
+        auto inner = []() -> quitter<int> {
+            co_return 10;
+        };
+
+        auto middle = [inner]() -> quitter<int> {
+            int v = co_await inner();
+            co_return v * 2;
+        };
+
+        int v = co_await middle();
+        co_return v + 5;
+    }
+
+    void
+    testNoStopRequested()
+    {
+        int result = 0;
+        test::run_blocking(
+            [&](int v) { result = v; })(quitter_chain());
+        BOOST_TEST_EQ(result, 25);
+    }
+
+    //----------------------------------------------------------
+    // 11. RAII verification
+    //----------------------------------------------------------
+
+    void
+    testRAIIVerification()
+    {
+        // Body starts, constructs guards, then stop is requested
+        // during the co_await.  All guard destructors must run.
+        // The body must NOT continue past co_await.
+        std::binary_semaphore suspended{0};
+        int dtor_count = 0;
+        bool reached_end = false;
+
+        auto q = [&]() -> quitter<>
+        {
+            raii_counter g1(dtor_count);
+            raii_counter g2(dtor_count);
+            raii_counter g3(dtor_count);
+            suspended.release();
+            co_await stop_only_awaitable{};
+            reached_end = true;
+        };
+
+        {
+            std::jthread jt(
+                [&](std::stop_token st)
+                {
+                    int dc = 0;
+                    test_executor ex(dc);
+                    run_async(ex, st)(q());
+                }
+            );
+            suspended.acquire();
+        }
+
+        BOOST_TEST_EQ(dtor_count, 3);
+        BOOST_TEST(!reached_end);
+    }
+
+    //----------------------------------------------------------
+    // 12. Multiple co_await — stop after second
+    //----------------------------------------------------------
+
+    static quitter<int>
+    quitter_multi_await(
+        std::atomic<int>& progress,
+        std::binary_semaphore& sem)
+    {
+        progress = 1;
+        co_await yield_awaitable{};
+        progress = 2;
+        sem.release();
+        co_await stop_only_awaitable{};
+        // Should not reach here
+        progress = 3;
+        co_return 0;
+    }
+
+    void
+    testMultipleCoAwait()
+    {
+        std::atomic<int> progress{0};
+        std::binary_semaphore sem{0};
+
+        {
+            std::jthread jt(
+                [&](std::stop_token st)
+                {
+                    int dc = 0;
+                    test_executor ex(dc);
+                    run_async(ex, st)(
+                        quitter_multi_await(progress, sem));
+                }
+            );
+            sem.acquire();
+            BOOST_TEST(progress == 2);
+            // jthread destructor requests stop
+        }
+
+        // The third await should have been short-circuited
+        BOOST_TEST(progress != 3);
+    }
+
+    //----------------------------------------------------------
+    // Move operations
+    //----------------------------------------------------------
+
+    void
+    testMoveOperations()
+    {
+        int dispatch_count = 0;
+        test_executor ex(dispatch_count);
+        io_env env{executor_ref(ex), {}, nullptr};
+
+        // move constructor
+        {
+            auto q1 = returns_int();
+            auto h1 = q1.handle();
+            q1.release();
+            BOOST_TEST(h1);
+
+            quitter<int> q2(std::move(q1));
+            BOOST_TEST(!q2.handle());
+
+            h1.promise().set_environment(&env);
+            while(!h1.done())
+                h1.resume();
+            BOOST_TEST_EQ(*h1.promise().result_, 42);
+            h1.destroy();
+        }
+
+        // release()
+        {
+            auto q = returns_int();
+            auto h = q.handle();
+            q.release();
+            BOOST_TEST(h);
+            BOOST_TEST(!q.handle());
+
+            h.promise().set_environment(&env);
+            while(!h.done())
+                h.resume();
+            BOOST_TEST(h.promise().result_.has_value());
+            BOOST_TEST_EQ(*h.promise().result_, 42);
+            h.destroy();
+        }
+    }
+
+    //----------------------------------------------------------
+    // Quitter returning string
+    //----------------------------------------------------------
+
+    static quitter<std::string>
+    returns_string()
+    {
+        co_return "hello";
+    }
+
+    void
+    testReturnString()
+    {
+        std::string result;
+        test::run_blocking(
+            [&](std::string v) { result = std::move(v); })(
+                returns_string());
+        BOOST_TEST_EQ(result, "hello");
+    }
+
+    //----------------------------------------------------------
+    // Exception in quitter<int>
+    //----------------------------------------------------------
+
+    static quitter<int>
+    quitter_throws_int()
+    {
+        throw test_exception("quitter int exception");
+        co_return 0;
+    }
+
+    void
+    testExceptionInValueQuitter()
+    {
+        BOOST_TEST_THROWS(
+            test::run_blocking()(quitter_throws_int()),
+            test_exception);
+    }
+
+    //----------------------------------------------------------
+    // 7. Stop propagation with when_all
+    //
+    //    Two quitter<io_result<size_t>> children inside when_all.
+    //    Both block on stop_only_awaitable.  when_all creates a
+    //    child stop_source; when the parent stop fires, when_all
+    //    propagates it.  Each quitter child intercepts the stop
+    //    in transform_awaiter::await_resume and short-circuits
+    //    via stop_requested_exception — it never reaches the
+    //    co_return.  Verify both stop and when_all completes.
+    //----------------------------------------------------------
+
+    static quitter<io_result<std::size_t>>
+    quitter_pending_size(bool& reached_co_return)
+    {
+        co_await stop_only_awaitable{};
+        // If quitter's transform_awaiter intercepted the stop,
+        // we never reach here.
+        reached_co_return = true;
+        co_return io_result<std::size_t>{
+            make_error_code(error::canceled), 0};
+    }
+
+    void
+    testWhenAllWithStop()
+    {
+        thread_pool pool(2);
+        std::latch done(1);
+        std::latch suspended(1);
+        std::stop_source source;
+
+        bool child1_returned = false;
+        bool child2_returned = false;
+
+        auto outer = [&]() -> task<>
+        {
+            suspended.count_down();
+            auto result = co_await when_all(
+                quitter_pending_size(child1_returned),
+                quitter_pending_size(child2_returned));
+            (void)result;
+        };
+
+        run_async(pool.get_executor(), source.get_token(),
+            [&]() { done.count_down(); },
+            [&](std::exception_ptr) { done.count_down(); })(
+                outer());
+
+        suspended.wait();
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(10 * failsafe_scale));
+        source.request_stop();
+
+        done.wait();
+        // Both quitter children were stopped by transform_awaiter
+        // before reaching co_return.
+        BOOST_TEST(!child1_returned);
+        BOOST_TEST(!child2_returned);
+    }
+
+    //----------------------------------------------------------
+    // 8. Stop propagation with when_any
+    //
+    //    Two quitter children.  One succeeds immediately,
+    //    when_any stops the sibling.  The sibling quitter
+    //    intercepts the stop and exits cleanly.
+    //----------------------------------------------------------
+
+    static quitter<io_result<std::size_t>>
+    quitter_success_size(std::size_t n)
+    {
+        co_return io_result<std::size_t>{{}, n};
+    }
+
+    void
+    testWhenAnyWithStop()
+    {
+        // One child succeeds immediately.  when_any stops
+        // the pending sibling quitter.  The sibling must
+        // be intercepted by transform_awaiter (never reach
+        // co_return).
+        {
+            thread_pool pool(2);
+            std::latch done(1);
+            bool sibling_returned = false;
+
+            auto outer = [&]() -> task<>
+            {
+                auto result = co_await when_any(
+                    quitter_success_size(42),
+                    quitter_pending_size(sibling_returned));
+                // Variadic when_any returns
+                // variant<error_code, size_t, size_t>.
+                // Index 1 = first child won.
+                BOOST_TEST(result.index() == 1);
+                if(result.index() == 1)
+                    BOOST_TEST_EQ(std::get<1>(result),
+                        std::size_t(42));
+            };
+
+            run_async(pool.get_executor(),
+                [&]() { done.count_down(); },
+                [&](std::exception_ptr) {
+                    done.count_down();
+                })(outer());
+
+            done.wait();
+            BOOST_TEST(!sibling_returned);
+        }
+
+        // Both children pending.  Parent stop fires.
+        // when_any propagates stop to children.  Both
+        // quitter children short-circuit.
+        {
+            thread_pool pool(2);
+            std::latch done(1);
+            std::latch suspended(1);
+            std::stop_source source;
+
+            bool child1_returned = false;
+            bool child2_returned = false;
+
+            auto outer = [&]() -> task<>
+            {
+                suspended.count_down();
+                auto result = co_await when_any(
+                    quitter_pending_size(child1_returned),
+                    quitter_pending_size(child2_returned));
+                (void)result;
+            };
+
+            run_async(pool.get_executor(), source.get_token(),
+                [&]() { done.count_down(); },
+                [&](std::exception_ptr) {
+                    done.count_down();
+                })(outer());
+
+            suspended.wait();
+            std::this_thread::sleep_for(
+                std::chrono::milliseconds(10 * failsafe_scale));
+            source.request_stop();
+
+            done.wait();
+            BOOST_TEST(!child1_returned);
+            BOOST_TEST(!child2_returned);
+        }
+    }
+
+    //----------------------------------------------------------
+    // 14. Timer cancellation
+    //----------------------------------------------------------
+
+    void
+    testTimerCancellation()
+    {
+        using namespace std::chrono_literals;
+
+        thread_pool pool(1);
+        std::latch done(1);
+        std::latch suspended(1);
+        std::stop_source source;
+        bool reached_end = false;
+
+        auto q = [&]() -> quitter<>
+        {
+            suspended.count_down();
+            auto [ec] = co_await delay(10s);
+            (void)ec;
+            reached_end = true;
+        };
+
+        auto start = std::chrono::steady_clock::now();
+
+        run_async(pool.get_executor(), source.get_token(),
+            [&]() { done.count_down(); },
+            [&](std::exception_ptr) { done.count_down(); })(q());
+
+        suspended.wait();
+        std::this_thread::sleep_for(
+            std::chrono::milliseconds(10 * failsafe_scale));
+        source.request_stop();
+
+        done.wait();
+        auto elapsed = std::chrono::steady_clock::now() - start;
+        // Should complete promptly, well under 10s
+        BOOST_TEST(elapsed < 1s);
+        // Quitter intercepted the stop — body did not continue
+        BOOST_TEST(!reached_end);
+    }
+
+    //----------------------------------------------------------
+    // 13. Echo server with shutdown
+    //
+    //    A quitter echo loop over a mock stream pair.
+    //    The client exchanges data, then requests stop.
+    //    The echo quitter exits cleanly via stop interception,
+    //    RAII runs, and the echoed data was correct.
+    //
+    //    The mock stream's read_some is not stop-aware, so we
+    //    wrap each read in when_any with a stop_only_awaitable
+    //    to make it cancellable — mirroring how a real server
+    //    would have the OS cancel an in-flight read.
+    //----------------------------------------------------------
+
+    void
+    testEchoWithShutdown()
+    {
+        // Echo server over a mock stream pair.  The server
+        // reads pre-provided data, echoes it back, then
+        // waits for shutdown via stop_only_awaitable.  When
+        // stop fires, the quitter intercepts at the co_await
+        // and exits cleanly.  All stream access is on the
+        // jthread's synchronous executor — no cross-thread
+        // stream use.
+        test::fuse f;
+        auto [server_end, client_end] =
+            test::make_stream_pair(f);
+
+        client_end.provide("hello");
+
+        int dtor_count = 0;
+        bool reached_end = false;
+        std::size_t total_echoed = 0;
+        std::binary_semaphore suspended{0};
+
+        auto echo_server = [&]() -> quitter<>
+        {
+            raii_counter guard(dtor_count);
+            char buf[64];
+
+            // Echo loop: process all available data
+            auto [ec, n] = co_await server_end.read_some(
+                make_buffer(buf));
+            if(ec)
+                co_return;
+            total_echoed += n;
+            auto [ec2, n2] = co_await server_end.write_some(
+                make_buffer(buf, n));
+            if(ec2)
+                co_return;
+
+            // Signal that echo is done, then wait for
+            // shutdown.  stop_only_awaitable suspends until
+            // the stop token fires.
+            suspended.release();
+            co_await stop_only_awaitable{};
+
+            // Should never reach here — quitter intercepts
+            reached_end = true;
+        };
+
+        {
+            std::jthread jt(
+                [&](std::stop_token st)
+                {
+                    int dc = 0;
+                    test_executor ex(dc);
+                    run_async(ex, st)(echo_server());
+                }
+            );
+            suspended.acquire();
+
+            // Verify the echo happened
+            BOOST_TEST_EQ(total_echoed, std::size_t(5));
+
+            // Read back the echoed data from client side
+            // (synchronous — data is already in the buffer)
+            // Not possible here since we're on the main
+            // thread and streams are single-threaded.  The
+            // echo write went to client_end's read buffer
+            // which we can't access cross-thread.  The
+            // echo itself is verified by total_echoed.
+
+            // jthread destructor requests stop and joins
+        }
+
+        BOOST_TEST_EQ(dtor_count, 1);
+        BOOST_TEST(!reached_end);
+    }
+
+    //----------------------------------------------------------
+    // run()
+    //----------------------------------------------------------
+
+    void
+    run()
+    {
+        testNormalCompletion();
+        testVoidCompletion();
+        testExceptionPropagation();
+        testStopBeforeFirstAwait();
+        testStopDuringIO();
+        testStopPropagationChain();
+        testMixingQuitterAndTask();
+        testNoStopRequested();
+        testRAIIVerification();
+        testMultipleCoAwait();
+        testMoveOperations();
+        testReturnString();
+        testExceptionInValueQuitter();
+        testWhenAllWithStop();
+        testWhenAnyWithStop();
+        testTimerCancellation();
+        testEchoWithShutdown();
+    }
+};
+
+TEST_SUITE(
+    quitter_test,
+    "boost.capy.quitter");
+
+} // capy
+} // boost
diff --git a/test/unit/read.cpp b/test/unit/read.cpp
index 05095ab96..c73ba4f1f 100644
--- a/test/unit/read.cpp
+++ b/test/unit/read.cpp
@@ -10,7 +10,6 @@
 // Test that header file is self-contained.
 #include <boost/capy/read.hpp>
 
-#include <boost/capy/buffers/buffer_pair.hpp>
 #include <boost/capy/buffers/circular_dynamic_buffer.hpp>
 #include <boost/capy/buffers/make_buffer.hpp>
 #include <boost/capy/buffers/string_dynamic_buffer.hpp>
@@ -108,7 +107,7 @@ struct buffer_pair_factory
         std::memset(storage2, 0, sizeof(storage2));
     }
 
-    mutable_buffer_pair
+    std::array<mutable_buffer, 2>
     buffer()
     {
         return {{
@@ -320,12 +319,45 @@ struct read_test
         }));
     }
 
+    // Regression: capy#263. Free-function read() must take its buffer
+    // sequence by value so that storing the returned awaitable past
+    // the full-expression that created the sequence does not dangle.
+    void
+    testReadStoredAwaitableTemporarySequence()
+    {
+        BOOST_TEST(test::fuse().armed([](test::fuse& f) -> task<void>
+        {
+            test::read_stream rs(f);
+            rs.provide("helloworld");
+
+            char storage[10] = {};
+
+            // The std::array<mutable_buffer, 2> argument is a temporary
+            // that ends its lifetime at the end of this full-expression.
+            auto aw = read(rs, std::array<mutable_buffer, 2>{{
+                mutable_buffer(storage, 5),
+                mutable_buffer(storage + 5, 5)
+            }});
+
+            // If read() bound the sequence by const&, the awaitable now
+            // holds a dangling reference and the next line trips ASan
+            // (or silently reads stale stack).
+            auto [ec, n] = co_await std::move(aw);
+            if(ec)
+                co_return;
+
+            BOOST_TEST_EQ(n, 10u);
+            BOOST_TEST_EQ(std::string_view(storage, 10), "helloworld");
+        }));
+    }
+
     void
     testReadStream()
     {
         testReadSingleBuffer();
         testReadBufferArray();
         testReadBufferPair();
+        testReadStoredAwaitableTemporarySequence();
     }
 
     //----------------------------------------------------------
diff --git a/test/unit/task.cpp b/test/unit/task.cpp
index b3d446c6f..65ca6a01f 100644
--- a/test/unit/task.cpp
+++ b/test/unit/task.cpp
@@ -19,6 +19,7 @@
 
 #include <atomic>
 #include <chrono>
+#include <semaphore>
 #include <thread>
 #include <queue>
 #include <stdexcept>
@@ -74,17 +75,17 @@ struct tracking_executor
     void on_work_started() const noexcept {}
     void on_work_finished() const noexcept {}
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
         ++(*dispatch_count_);
         if (dispatch_log)
             dispatch_log->push_back(id);
-        return h;
+        return c.h;
     }
 
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
-        h.resume();
+        c.h.resume();
     }
 };
 
@@ -1220,6 +1221,7 @@ struct task_test
     {
 
         std::atomic<int> state = 0;
+        std::binary_semaphore suspended{0};
 
         auto l = [&]() -> capy::task<void>
         {
@@ -1230,6 +1232,7 @@ struct task_test
             } sc{state};
 
             state = 1;
+            suspended.release();
             co_await stop_only_awaitable{};
             state = 2;
         };
@@ -1244,7 +1247,7 @@ struct task_test
                     run_async(ex, st)(l());
                 }
             );
-            std::this_thread::sleep_for(std::chrono::milliseconds(50));
+            suspended.acquire();
             BOOST_TEST(state == 1);
         }
 
diff --git a/test/unit/test/bufgrind.cpp b/test/unit/test/bufgrind.cpp
index a3e647ba1..64ed44d43 100644
--- a/test/unit/test/bufgrind.cpp
+++ b/test/unit/test/bufgrind.cpp
@@ -11,6 +11,7 @@
 #include <boost/capy/test/bufgrind.hpp>
 
 #include <boost/capy/buffers/make_buffer.hpp>
+#include <boost/capy/concept/slice.hpp>
 #include <boost/capy/task.hpp>
 #include <boost/capy/test/buffer_to_string.hpp>
 #include <boost/capy/test/fuse.hpp>
@@ -58,8 +59,8 @@ class bufgrind_test
             int count = 0;
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
-                BOOST_TEST_EQ(buffer_size(b1), 0u);
-                BOOST_TEST_EQ(buffer_size(b2), 0u);
+                BOOST_TEST_EQ(buffer_size(b1.data()), 0u);
+                BOOST_TEST_EQ(buffer_size(b2.data()), 0u);
                 ++count;
             }
             BOOST_TEST_EQ(count, 1);
@@ -80,11 +81,11 @@ class bufgrind_test
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
                 if(count == 0) {
-                    BOOST_TEST_EQ(buffer_size(b1), 0u);
-                    BOOST_TEST_EQ(buffer_size(b2), 1u);
+                    BOOST_TEST_EQ(buffer_size(b1.data()), 0u);
+                    BOOST_TEST_EQ(buffer_size(b2.data()), 1u);
                 } else if(count == 1) {
-                    BOOST_TEST_EQ(buffer_size(b1), 1u);
-                    BOOST_TEST_EQ(buffer_size(b2), 0u);
+                    BOOST_TEST_EQ(buffer_size(b1.data()), 1u);
+                    BOOST_TEST_EQ(buffer_size(b2.data()), 0u);
                 }
                 ++count;
             }
@@ -107,9 +108,9 @@ class bufgrind_test
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
 
-                BOOST_TEST_EQ(buffer_to_string(b1, b2), data);
-                BOOST_TEST_EQ(b1.size(), static_cast<std::size_t>(count));
-                BOOST_TEST_EQ(b2.size(), data.size() - count);
+                BOOST_TEST_EQ(buffer_to_string(b1.data(), b2.data()), data);
+                BOOST_TEST_EQ(buffer_size(b1.data()), static_cast<std::size_t>(count));
+                BOOST_TEST_EQ(buffer_size(b2.data()), data.size() - count);
                 ++count;
             }
             BOOST_TEST_EQ(count, 6);
@@ -129,7 +130,7 @@ class bufgrind_test
             std::vector<std::size_t> positions;
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
-                positions.push_back(buffer_size(b1));
+                positions.push_back(buffer_size(b1.data()));
             }
 
             // Expect: 0, 3, 6, 9, 10 (always includes final position)
@@ -156,7 +157,7 @@ class bufgrind_test
             std::vector<std::size_t> positions;
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
-                positions.push_back(buffer_size(b1));
+                positions.push_back(buffer_size(b1.data()));
             }
 
             // Expect: 0, 2, 4, 6
@@ -200,7 +201,7 @@ class bufgrind_test
             std::vector<std::size_t> positions;
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
-                positions.push_back(buffer_size(b1));
+                positions.push_back(buffer_size(b1.data()));
             }
 
             // Expect: 0, 3 (clamped to size, then final)
@@ -224,11 +225,10 @@ class bufgrind_test
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
 
-                // slice_type<mutable_buffer> is mutable_buffer
-                // Verify sizes are correct and types are mutable
-                static_assert(std::is_same_v<decltype(b1), mutable_buffer>);
-                static_assert(std::is_same_v<decltype(b2), mutable_buffer>);
-                BOOST_TEST_EQ(b1.size() + b2.size(), 5u);
+                // Slices over a mutable input model MutableSlice
+                static_assert(MutableSlice<decltype(b1)>);
+                static_assert(MutableSlice<decltype(b2)>);
+                BOOST_TEST_EQ(buffer_size(b1.data()) + buffer_size(b2.data()), 5u);
             }
         });
         BOOST_TEST(r.success);
@@ -247,11 +247,13 @@ class bufgrind_test
             while(bg) {
                 auto [b1, b2] = co_await bg.next();
 
-                // slice_type<const_buffer> is const_buffer
-                // Verify sizes are correct and types are const
-                static_assert(std::is_same_v<decltype(b1), const_buffer>);
-                static_assert(std::is_same_v<decltype(b2), const_buffer>);
-                BOOST_TEST_EQ(b1.size() + b2.size(), 5u);
+                // Slices over a const-only input model Slice but not
+                // MutableSlice.
+                static_assert(Slice<decltype(b1)>);
+                static_assert(!MutableSlice<decltype(b1)>);
+                static_assert(Slice<decltype(b2)>);
+                static_assert(!MutableSlice<decltype(b2)>);
+                BOOST_TEST_EQ(buffer_size(b1.data()) + buffer_size(b2.data()), 5u);
             }
         });
         BOOST_TEST(r.success);
@@ -276,7 +278,7 @@ class bufgrind_test
                 auto [b1, b2] = co_await bg.next();
 
                 // Verify concatenation reconstructs original
-                BOOST_TEST_EQ(buffer_to_string(b1, b2), "abcdef");
+                BOOST_TEST_EQ(buffer_to_string(b1.data(), b2.data()), "abcdef");
                 ++count;
             }
             BOOST_TEST_EQ(count, 7);
@@ -299,17 +301,15 @@ class bufgrind_test
 
                 // Set up read_stream with data matching b1 size
                 read_stream rs(f);
-                rs.provide(std::string_view(
-                    static_cast<char const*>(b1.data()),
-                    b1.size()));
+                rs.provide(buffer_to_string(b1.data()));
 
                 // Read into a destination buffer
-                if(b1.size() > 0) {
+                if(buffer_size(b1.data()) > 0) {
                     std::string dest;
-                    dest.resize(b1.size());
+                    dest.resize(buffer_size(b1.data()));
                     auto [ec, n] = co_await rs.read_some(make_buffer(dest));
                     BOOST_TEST(! ec);
-                    BOOST_TEST_EQ(n, b1.size());
+                    BOOST_TEST_EQ(n, buffer_size(b1.data()));
                 }
             }
         });
@@ -332,20 +332,20 @@ class bufgrind_test
                 // Write b1 then b2 to stream
                 write_stream ws(f);
 
-                if(b1.size() > 0) {
-                    auto [ec1, n1] = co_await ws.write_some(b1);
+                if(buffer_size(b1.data()) > 0) {
+                    auto [ec1, n1] = co_await ws.write_some(b1.data());
                     BOOST_TEST(! ec1);
-                    BOOST_TEST_EQ(n1, b1.size());
+                    BOOST_TEST_EQ(n1, buffer_size(b1.data()));
                 }
 
-                if(b2.size() > 0) {
-                    auto [ec2, n2] = co_await ws.write_some(b2);
+                if(buffer_size(b2.data()) > 0) {
+                    auto [ec2, n2] = co_await ws.write_some(b2.data());
                     BOOST_TEST(! ec2);
-                    BOOST_TEST_EQ(n2, b2.size());
+                    BOOST_TEST_EQ(n2, buffer_size(b2.data()));
                 }
 
                 // Verify total written equals original
-                BOOST_TEST_EQ(ws.data(), buffer_to_string(b1, b2));
+                BOOST_TEST_EQ(ws.data(), buffer_to_string(b1.data(), b2.data()));
             }
         });
         BOOST_TEST(r.success);
@@ -369,16 +369,16 @@ class bufgrind_test
 
                     // Write both parts through stream
                     write_stream ws(f);
-                    if(b1.size() > 0) {
-                        auto [ec, n] = co_await ws.write_some(b1);
+                    if(buffer_size(b1.data()) > 0) {
+                        auto [ec, n] = co_await ws.write_some(b1.data());
                         BOOST_TEST(! ec);
                     }
-                    if(b2.size() > 0) {
-                        auto [ec, n] = co_await ws.write_some(b2);
+                    if(buffer_size(b2.data()) > 0) {
+                        auto [ec, n] = co_await ws.write_some(b2.data());
                         BOOST_TEST(! ec);
                     }
 
-                    BOOST_TEST_EQ(ws.data(), buffer_to_string(b1, b2));
+                    BOOST_TEST_EQ(ws.data(), buffer_to_string(b1.data(), b2.data()));
                     BOOST_TEST_EQ(ws.data(), original);
                 }
             }
diff --git a/test/unit/test/stream.cpp b/test/unit/test/stream.cpp
index 294594210..d2e666a1a 100644
--- a/test/unit/test/stream.cpp
+++ b/test/unit/test/stream.cpp
@@ -15,6 +15,7 @@
 #include <boost/capy/concept/stream.hpp>
 #include <boost/capy/concept/write_stream.hpp>
 #include <boost/capy/cond.hpp>
+#include <boost/capy/io_task.hpp>
 #include <boost/capy/task.hpp>
 #include <boost/capy/test/run_blocking.hpp>
 #include <boost/capy/when_all.hpp>
@@ -736,24 +737,26 @@ class stream_pair_test
         auto r = f.armed([&](fuse&) -> task<> {
             auto [a, b] = make_stream_pair(f);
 
-            co_await when_all(
-                [](stream a) -> task<> {
+            (void) co_await when_all(
+                [](stream a) -> io_task<> {
                     char buf[32] = {};
                     auto [ec, n] = co_await a.read_some(
                         make_buffer(buf));
                     if(ec)
-                        co_return;
+                        co_return io_result<>{ec};
                     BOOST_TEST_EQ(n, 5u);
                     BOOST_TEST_EQ(
                         std::string_view(buf, n),
                         "hello");
+                    co_return io_result<>{};
                 }(std::move(a)),
-                [](stream b) -> task<> {
+                [](stream b) -> io_task<> {
                     auto [ec, n] = co_await b.write_some(
                         make_buffer("hello", 5));
                     if(ec)
-                        co_return;
+                        co_return io_result<>{ec};
                     BOOST_TEST_EQ(n, 5u);
+                    co_return io_result<>{};
                 }(std::move(b))
             );
         });
@@ -953,24 +956,26 @@ class stream_pair_test
         auto r = f.armed([&](fuse&) -> task<> {
             auto [a, b] = make_stream_pair(f);
 
-            co_await when_all(
-                [](stream a) -> task<> {
+            (void) co_await when_all(
+                [](stream a) -> io_task<> {
                     char buf[3] = {};
                     auto [ec, n] = co_await a.read_some(
                         make_buffer(buf));
                     if(ec)
-                        co_return;
+                        co_return io_result<>{ec};
                     BOOST_TEST_EQ(n, 3u);
                     BOOST_TEST_EQ(
                         std::string_view(buf, n),
                         "hel");
+                    co_return io_result<>{};
                 }(std::move(a)),
-                [](stream b) -> task<> {
+                [](stream b) -> io_task<> {
                     auto [ec, n] = co_await b.write_some(
                         make_buffer("hello", 5));
                     if(ec)
-                        co_return;
+                        co_return io_result<>{ec};
                     BOOST_TEST_EQ(n, 5u);
+                    co_return io_result<>{};
                 }(std::move(b))
             );
         });
@@ -1042,17 +1047,18 @@ class stream_pair_test
         run_blocking()([&]() -> task<> {
             auto [a, b] = make_stream_pair(f);
 
-            co_await when_all(
-                [](stream a) -> task<> {
+            (void) co_await when_all(
+                [](stream a) -> io_task<> {
                     char buf[32] = {};
                     auto [ec, n] = co_await a.read_some(
                         make_buffer(buf));
                     BOOST_TEST(ec == cond::eof);
                     BOOST_TEST_EQ(n, 0u);
+                    co_return io_result<>{ec};
                 }(std::move(a)),
-                [](stream b) -> task<> {
+                [](stream b) -> io_task<> {
                     b.close();
-                    co_return;
+                    co_return io_result<>{};
                 }(std::move(b))
             );
         }());
@@ -1099,8 +1105,8 @@ class stream_pair_test
         auto r = f.armed([&](fuse&) -> task<> {
             auto [a, b] = make_stream_pair(f);
 
-            co_await when_all(
-                [](stream a) -> task<> {
+            (void) co_await when_all(
+                [](stream a) -> io_task<> {
                     // Reader suspends waiting for data.
                     // Gets data, eof from peer's guard,
                     // or its own fuse error on resume.
@@ -1108,17 +1114,19 @@ class stream_pair_test
                     auto [ec, n] = co_await a.read_some(
                         make_buffer(buf));
                     if(ec)
-                        co_return;
+                        co_return io_result<>{ec};
                     BOOST_TEST_EQ(n, 5u);
+                    co_return io_result<>{};
                 }(std::move(a)),
-                [](stream b) -> task<> {
+                [](stream b) -> io_task<> {
                     // Writer may get fuse error, which
                     // closes the peer via the guard
                     auto [ec, n] = co_await b.write_some(
                         make_buffer("hello", 5));
                     if(ec)
-                        co_return;
+                        co_return io_result<>{ec};
                     BOOST_TEST_EQ(n, 5u);
+                    co_return io_result<>{};
                 }(std::move(b))
             );
         });
diff --git a/test/unit/test_dynamic_buffer.hpp b/test/unit/test_dynamic_buffer.hpp
index c6779900e..79c58bea0 100644
--- a/test/unit/test_dynamic_buffer.hpp
+++ b/test/unit/test_dynamic_buffer.hpp
@@ -21,7 +21,6 @@
 #include "test_suite.hpp"
 
 #include <string>
-#include <string_view>
 
 namespace boost {
 namespace capy {
@@ -51,25 +50,24 @@ grind_dynamic_buffer(F&& make_buffer_fn)
         while(bg)
         {
             auto [b1, b2] = co_await bg.next();
-            BOOST_TEST_EQ(buffer_to_string(b1, b2), data);
+            BOOST_TEST_EQ(buffer_to_string(b1.data(), b2.data()), data);
 
             auto db = make_buffer_fn();
 
             // Read b1 into dynamic buffer via read_stream
             read_stream rs(f);
-            rs.provide(std::string_view(
-                static_cast<char const*>(b1.data()), b1.size()));
+            rs.provide(buffer_to_string(b1.data()));
 
-            if(buffer_size(b1) > 0)
+            if(buffer_size(b1.data()) > 0)
             {
-                auto mb = db.prepare(buffer_size(b1));
+                auto mb = db.prepare(buffer_size(b1.data()));
                 auto [ec, n] = co_await rs.read_some(mb);
                 if(ec)
                     co_return;
                 db.commit(n);
             }
 
-            BOOST_TEST_EQ(db.size(), buffer_size(b1));
+            BOOST_TEST_EQ(db.size(), buffer_size(b1.data()));
 
             // Write from dynamic buffer to write_stream
             write_stream ws(f);
@@ -82,7 +80,7 @@ grind_dynamic_buffer(F&& make_buffer_fn)
             }
 
             // Verify round-trip
-            BOOST_TEST_EQ(ws.data(), buffer_to_string(b1));
+            BOOST_TEST_EQ(ws.data(), buffer_to_string(b1.data()));
 
             db.consume(db.size());
             BOOST_TEST_EQ(db.size(), 0u);
diff --git a/test/unit/test_helpers.hpp b/test/unit/test_helpers.hpp
index c1436f719..08f421f5c 100644
--- a/test/unit/test_helpers.hpp
+++ b/test/unit/test_helpers.hpp
@@ -18,7 +18,7 @@
 #include <boost/capy/ex/io_env.hpp>
 #include <boost/capy/task.hpp>
 
-#include <optional>
+#include <atomic>
 #include <stop_token>
 
 #include "test_suite.hpp"
@@ -30,6 +30,14 @@
 #include <string>
 #include <thread>
 
+// Valgrind slows execution ~10-20x; scale timing-sensitive
+// durations to avoid false failures.
+#ifdef BOOST_NO_STRESS_TEST
+inline constexpr int failsafe_scale = 20;
+#else
+inline constexpr int failsafe_scale = 1;
+#endif
+
 #if defined(__linux__) || defined(__FreeBSD__) || defined(__NetBSD__) || defined(__APPLE__)
 #include <pthread.h>
 #define BOOST_CAPY_TEST_CAN_GET_THREAD_NAME 1
@@ -100,17 +108,17 @@ struct test_executor
     void on_work_finished() const noexcept {}
 
     std::coroutine_handle<>
-    dispatch(std::coroutine_handle<> h) const
+    dispatch(continuation& c) const
     {
         if(dispatch_count_)
             ++(*dispatch_count_);
-        return h;
+        return c.h;
     }
 
     void
-    post(std::coroutine_handle<> h) const
+    post(continuation& c) const
     {
-        h.resume();
+        c.h.resume();
     }
 };
 
@@ -248,13 +256,60 @@ struct self_destroy_awaitable
 };
 
 
-// test awaitable that must be stopped in order to resume
+// Callable that posts a continuation to an executor instead of
+// resuming a coroutine handle inline.  Use as the callback type
+// for std::stop_callback — direct resumption runs the coroutine
+// on whatever thread calls request_stop(), bypassing the executor.
+struct resume_via_post
+{
+    executor_ref ex;
+    mutable continuation cont;
+
+    void operator()() const noexcept
+    {
+        ex.post(cont);
+    }
+};
+
+using stop_resume_callback = std::stop_callback<resume_via_post>;
+
+inline resume_via_post
+post_resume(
+    io_env const& env,
+    std::coroutine_handle<> h) noexcept
+{
+    return resume_via_post{env.executor, continuation{h}};
+}
+
+// test awaitable that must be stopped in order to resume.
+// Uses resume_via_post to ensure the coroutine resumes on the
+// executor's thread, not on whatever thread calls request_stop().
 struct stop_only_awaitable
 {
     stop_only_awaitable() noexcept = default;
     stop_only_awaitable(stop_only_awaitable && ) noexcept {}
 
-    std::optional<std::stop_callback<std::coroutine_handle<>>> stop_cb;
+    // Placement-new storage instead of std::optional to avoid a
+    // data race on optional's _M_engaged flag.  The stop_callback
+    // constructor synchronises with request_stop() through the
+    // stop-state's atomics, but optional::emplace writes _M_engaged
+    // *after* the constructor returns — outside that sync window.
+    // When ~jthread calls request_stop() before join(), the
+    // destructor's _M_reset (on the requesting thread) races with
+    // emplace's _M_engaged write (on the registering thread).
+    BOOST_CAPY_MSVC_WARNING_PUSH
+    BOOST_CAPY_MSVC_WARNING_DISABLE(4324) // padded due to alignas
+    alignas(stop_resume_callback)
+        unsigned char stop_cb_buf_[sizeof(stop_resume_callback)]{};
+    BOOST_CAPY_MSVC_WARNING_POP
+    std::atomic<bool> active_{false};
+
+    ~stop_only_awaitable()
+    {
+        if (active_.load(std::memory_order_acquire))
+            reinterpret_cast<stop_resume_callback*>(
+                stop_cb_buf_)->~stop_resume_callback();
+    }
 
     bool await_ready() {return false;}
 
@@ -262,7 +317,9 @@ struct stop_only_awaitable
     {
         if (env->stop_token.stop_requested())
             return h;
-        stop_cb.emplace(env->stop_token, h);
+        ::new(stop_cb_buf_) stop_resume_callback(
+            env->stop_token, post_resume(*env, h));
+        active_.store(true, std::memory_order_release);
         return std::noop_coroutine();
     }
     void await_resume() {}
@@ -362,15 +419,15 @@ struct queuing_executor
     void on_work_started() const noexcept {}
     void on_work_finished() const noexcept {}
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
+    std::coroutine_handle<> dispatch(continuation& c) const
     {
-        queue_->push(h);
+        queue_->push(c.h);
         return std::noop_coroutine();
     }
 
-    void post(std::coroutine_handle<> h) const
+    void post(continuation& c) const
     {
-        queue_->push(h);
+        queue_->push(c.h);
     }
 };
 
@@ -388,6 +445,8 @@ static_assert(Executor<queuing_executor>);
 */
 struct yield_awaitable
 {
+    continuation cont_;
+
     bool await_ready() const noexcept
     {
         return false;
@@ -396,7 +455,8 @@ struct yield_awaitable
     std::coroutine_handle<> await_suspend(std::coroutine_handle<> h, io_env const* env)
     {
         // Post ourselves back to the queue
-        env->executor.post(h);
+        cont_.h = h;
+        env->executor.post(cont_);
         return std::noop_coroutine();
     }
 
diff --git a/test/unit/timeout.cpp b/test/unit/timeout.cpp
new file mode 100644
index 000000000..6a906b38a
--- /dev/null
+++ b/test/unit/timeout.cpp
@@ -0,0 +1,305 @@
+//
+// Copyright (c) 2026 Michael Vandeberg
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+//
+// Official repository: https://github.com/cppalliance/capy
+//
+
+// Test that header file is self-contained.
+#include <boost/capy/timeout.hpp>
+
+#include <boost/capy/cond.hpp>
+#include <boost/capy/error.hpp>
+#include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
+#include <boost/capy/io_task.hpp>
+
+#include "test_helpers.hpp"
+#include "test_suite.hpp"
+
+#include <latch>
+#include <string>
+
+namespace boost {
+namespace capy {
+
+using namespace std::chrono_literals;
+
+//----------------------------------------------------------
+// Helper tasks for timeout testing
+//----------------------------------------------------------
+
+// Returns an io_result<int> immediately
+inline io_task<int>
+returns_io_int(int value)
+{
+    co_return io_result<int>{{}, value};
+}
+
+// Returns an io_result<std::string> immediately
+inline io_task<std::string>
+returns_io_string(std::string value)
+{
+    co_return io_result<std::string>{{}, std::move(value)};
+}
+
+// Returns io_result<> immediately (void equivalent)
+inline io_task<>
+returns_io_void()
+{
+    co_return io_result<>{};
+}
+
+// Returns io_result<std::size_t> after stop is requested
+inline io_task<std::size_t>
+slow_io_result(std::size_t n)
+{
+    co_await stop_only_awaitable{};
+    co_return io_result<std::size_t>{{}, n};
+}
+
+// Returns io_result<int> after stop is requested
+inline io_task<int>
+slow_io_int(int value)
+{
+    co_await stop_only_awaitable{};
+    co_return io_result<int>{{}, value};
+}
+
+// Returns io_result<> after stop is requested
+inline io_task<>
+slow_io_void()
+{
+    co_await stop_only_awaitable{};
+    co_return io_result<>{};
+}
+
+// io_task that throws an exception immediately
+inline io_task<int>
+io_immediate_throw(char const* msg)
+{
+    throw test_exception(msg);
+    co_return io_result<int>{{}, 0};
+}
+
+//----------------------------------------------------------
+// Tests
+//----------------------------------------------------------
+
+struct timeout_test
+{
+    // Test: io_result<int> completes before timeout
+    void
+    testTaskCompletesBeforeTimeout()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        io_result<int> result{};
+
+        run_async(pool.get_executor(),
+            [&](io_result<int> r) {
+                result = r;
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(timeout(returns_io_int(42), 5s));
+
+        done.wait();
+        BOOST_TEST(!result.ec);
+        BOOST_TEST_EQ(std::get<0>(result.values), 42);
+    }
+
+    // Test: io_result<string> completes before timeout
+    void
+    testTaskCompletesWithString()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        io_result<std::string> result{};
+
+        run_async(pool.get_executor(),
+            [&](io_result<std::string> r) {
+                result = std::move(r);
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(timeout(returns_io_string("hello"), 5s));
+
+        done.wait();
+        BOOST_TEST(!result.ec);
+        BOOST_TEST_EQ(std::get<0>(result.values), "hello");
+    }
+
+    // Test: io_result<> completes before timeout
+    void
+    testVoidTaskCompletes()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        io_result<> result{make_error_code(error::timeout)};
+
+        run_async(pool.get_executor(),
+            [&](io_result<> r) {
+                result = r;
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(timeout(returns_io_void(), 5s));
+
+        done.wait();
+        BOOST_TEST(!result.ec);
+    }
+
+    // Test: Timeout fires - io_result<size_t> path returns error::timeout
+    void
+    testTimeoutIoResult()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        std::error_code ec;
+        std::size_t n = 999;
+
+        run_async(pool.get_executor(),
+            [&](io_result<std::size_t> r) {
+                ec = r.ec;
+                n = std::get<0>(r.values);
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(timeout(slow_io_result(100), 1ms));
+
+        done.wait();
+        BOOST_TEST(ec == error::timeout);
+        BOOST_TEST(ec == cond::timeout);
+        BOOST_TEST_EQ(n, 0u);
+    }
+
+    // Test: Timeout fires - io_result<int> reports error::timeout
+    void
+    testTimeoutReportsErrorForInt()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        std::error_code ec;
+
+        run_async(pool.get_executor(),
+            [&](io_result<int> r) {
+                ec = r.ec;
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(timeout(slow_io_int(42), 1ms));
+
+        done.wait();
+        BOOST_TEST(ec == error::timeout);
+    }
+
+    // Test: Timeout fires - io_result<> reports error::timeout
+    void
+    testTimeoutReportsErrorForVoid()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        std::error_code ec;
+
+        run_async(pool.get_executor(),
+            [&](io_result<> r) {
+                ec = r.ec;
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(timeout(slow_io_void(), 1ms));
+
+        done.wait();
+        BOOST_TEST(ec == error::timeout);
+    }
+
+    // Test: Zero duration times out immediately
+    void
+    testZeroDuration()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        std::error_code ec;
+
+        run_async(pool.get_executor(),
+            [&](io_result<int> r) {
+                ec = r.ec;
+                done.count_down();
+            },
+            [&](std::exception_ptr) {
+                done.count_down();
+            })(timeout(slow_io_int(42), 0ms));
+
+        done.wait();
+        BOOST_TEST(ec == error::timeout);
+    }
+
+    // Test: cond::timeout equivalence
+    void
+    testCondEquivalence()
+    {
+        auto ec = make_error_code(error::timeout);
+        BOOST_TEST(ec == cond::timeout);
+        BOOST_TEST(!(ec == cond::canceled));
+        BOOST_TEST(!(ec == cond::eof));
+
+        auto cond_ec = make_error_condition(cond::timeout);
+        BOOST_TEST(cond_ec.message() == "operation timed out");
+    }
+
+    // Inner task throws before delay fires.
+    // Exception propagates to caller, not swallowed by timer.
+    void
+    testThrowPropagatesBeforeTimeout()
+    {
+        thread_pool pool(1);
+        std::latch done(1);
+        bool caught = false;
+        std::string msg;
+
+        run_async(pool.get_executor(),
+            [&](io_result<int>) {
+                done.count_down();
+            },
+            [&](std::exception_ptr ep) {
+                try { std::rethrow_exception(ep); }
+                catch (test_exception const& e) {
+                    caught = true;
+                    msg = e.what();
+                }
+                done.count_down();
+            })(timeout(io_immediate_throw("boom"), 5s));
+
+        done.wait();
+        BOOST_TEST(caught);
+        BOOST_TEST_EQ(msg, "boom");
+    }
+
+    void
+    run()
+    {
+        testTaskCompletesBeforeTimeout();
+        testTaskCompletesWithString();
+        testVoidTaskCompletes();
+        testTimeoutIoResult();
+        testTimeoutReportsErrorForInt();
+        testTimeoutReportsErrorForVoid();
+        testZeroDuration();
+        testCondEquivalence();
+        testThrowPropagatesBeforeTimeout();
+    }
+};
+
+TEST_SUITE(timeout_test, "capy.timeout");
+
+} // capy
+} // boost
diff --git a/test/unit/when_all.cpp b/test/unit/when_all.cpp
index a7b22614e..d27d829cc 100644
--- a/test/unit/when_all.cpp
+++ b/test/unit/when_all.cpp
@@ -10,23 +10,27 @@
 // Test that header file is self-contained.
 #include <boost/capy/when_all.hpp>
 
-#include <boost/capy/ex/async_event.hpp>
+#include <boost/capy/cond.hpp>
+#include <boost/capy/error.hpp>
 #include <boost/capy/ex/run_async.hpp>
+#include <boost/capy/ex/strand.hpp>
+#include <boost/capy/ex/thread_pool.hpp>
 #include <boost/capy/io_task.hpp>
 #include <boost/capy/task.hpp>
 
 #include "test_helpers.hpp"
 
 #include <atomic>
+#include <latch>
 #include <stdexcept>
 #include <string>
+#include <system_error>
+#include <tuple>
 #include <vector>
 
-// GCC-11 gives false positive -Wmaybe-uninitialized warnings when run_async.hpp's
-// await_suspend is inlined into lambdas. The warnings occur because GCC's flow
-// analysis can't see through the coroutine machinery to verify that result_ is
-// initialized before use. Suppress these false positives for this entire file.
-#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ == 11
+// GCC gives false positive -Wmaybe-uninitialized on structured bindings
+// via the tuple protocol inside coroutine frames.
+#if defined(__GNUC__) && !defined(__clang__)
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif
 #include <type_traits>
@@ -34,1018 +38,1149 @@
 namespace boost {
 namespace capy {
 
-// Static assertions for void filtering type trait
-static_assert(std::is_same_v<
-    detail::filter_void_tuple_t<int>,
-    std::tuple<int>>);
-static_assert(std::is_same_v<
-    detail::filter_void_tuple_t<void>,
-    std::tuple<>>);
-static_assert(std::is_same_v<
-    detail::filter_void_tuple_t<int, void, std::string>,
-    std::tuple<int, std::string>>);
-static_assert(std::is_same_v<
-    detail::filter_void_tuple_t<void, void, void>,
-    std::tuple<>>);
-
-// Verify result_type: void when all tasks are void, tuple otherwise
-static_assert(std::is_same_v<
-    when_all_result_type<int, std::string>,
-    std::tuple<int, std::string>>);
-static_assert(std::is_same_v<
-    when_all_result_type<int, void, std::string>,
-    std::tuple<int, std::string>>);
-static_assert(std::is_void_v<
-    when_all_result_type<void>>);
-static_assert(std::is_void_v<
-    when_all_result_type<void, void>>);
-static_assert(std::is_void_v<
-    when_all_result_type<void, void, void>>);
-
 // Verify when_all returns task which satisfies awaitable protocols
-static_assert(IoAwaitable<task<std::tuple<int, int>>>);
-
-// Verify non-task IoAwaitables work with when_all
-template<typename... Args>
-concept WhenAllCallable = requires(Args... args) {
-    when_all(std::move(args)...);
-};
+static_assert(IoAwaitable<task<io_result<size_t, size_t>>>);
 
-static_assert(WhenAllCallable<stop_only_awaitable>);
-static_assert(WhenAllCallable<stop_only_awaitable, stop_only_awaitable>);
-static_assert(WhenAllCallable<async_event::wait_awaiter>);
-static_assert(WhenAllCallable<async_event::wait_awaiter, stop_only_awaitable>);
-static_assert(WhenAllCallable<task<int>, stop_only_awaitable>);
-
-struct when_all_test
+struct when_all_strand_test
 {
-
-    // Test: Single task with when_all succeeds
+    // Regression for #131: executor_ref::dispatch() formerly
+    // returned void, discarding the symmetric transfer handle
+    // from strand::dispatch(). This caused when_all child
+    // runners to never resume, deadlocking the caller.
     void
-    testAllSucceed()
+    testStrandWhenAll()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        thread_pool pool(2);
+        strand s{pool.get_executor()};
+        std::latch done(1);
         bool completed = false;
-        int result = 0;
 
-        run_async(ex,
-            [&](std::tuple<int> t) {
-                auto [v] = t;
+        auto outer = [&]() -> task<io_result<std::tuple<>, std::tuple<>>> {
+            co_return co_await when_all(
+                []() -> io_task<> { co_return io_result<>{{}}; }(),
+                []() -> io_task<> { co_return io_result<>{{}}; }()
+            );
+        };
+
+        run_async(s,
+            [&](auto&&...) {
                 completed = true;
-                result = v;
+                done.count_down();
             },
-            [](std::exception_ptr) {})(when_all(returns_int(42)));
+            [&](auto) {
+                done.count_down();
+            }
+        )(outer());
 
+        done.wait();
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 42);
     }
 
-    // Test: Three tasks succeed
+    // Verify strand + when_all propagates values correctly
     void
-    testThreeTasksSucceed()
+    testStrandWhenAllWithValues()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        thread_pool pool(2);
+        strand s{pool.get_executor()};
+        std::latch done(1);
         bool completed = false;
-        int result = 0;
+        size_t result = 0;
+
+        auto outer = [&]() -> task<io_result<size_t, size_t>> {
+            co_return co_await when_all(
+                []() -> io_task<size_t> {
+                    co_return io_result<size_t>{{}, 10};
+                }(),
+                []() -> io_task<size_t> {
+                    co_return io_result<size_t>{{}, 20};
+                }());
+        };
 
-        run_async(ex,
-            [&](std::tuple<int, int, int> t) {
-                auto [a, b, c] = t;
+        run_async(s,
+            [&](io_result<size_t, size_t> r) {
                 completed = true;
-                result = a + b + c;
+                result = std::get<0>(r.values) + std::get<1>(r.values);
+                done.count_down();
             },
-            [](std::exception_ptr) {})(
-            when_all(returns_int(1), returns_int(2), returns_int(3)));
+            [&](auto) {
+                done.count_down();
+            }
+        )(outer());
 
+        done.wait();
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 6);
+        BOOST_TEST_EQ(result, 30u);
     }
 
-    // Test: Mixed types (int, string, void)
     void
-    testMixedTypes()
+    run()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        testStrandWhenAll();
+        testStrandWhenAllWithValues();
+    }
+};
+
+TEST_SUITE(
+    when_all_strand_test,
+    "boost.capy.when_all_strand");
+
+// Verify IoAwaitableRange concept
+static_assert(IoAwaitableRange<std::vector<io_task<size_t>>>);
+static_assert(IoAwaitableRange<std::vector<io_task<>>>);
+
+// io_task helpers for io_result-aware spec tests
+namespace {
+
+io_task<size_t>
+io_success_size(size_t n)
+{
+    co_return io_result<size_t>{{}, n};
+}
+
+io_task<size_t>
+io_error_size(std::error_code ec, size_t n = 0)
+{
+    co_return io_result<size_t>{ec, n};
+}
+
+io_task<>
+io_void_ok()
+{
+    co_return io_result<>{};
+}
+
+io_task<>
+io_void_error(std::error_code ec)
+{
+    co_return io_result<>{ec};
+}
+
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4702) // unreachable code after throw
+#endif
+
+io_task<>
+io_void_throws(char const* msg)
+{
+    throw test_exception(msg);
+    co_return io_result<>{};
+}
+
+io_task<size_t>
+io_throws_size(char const* msg)
+{
+    throw test_exception(msg);
+    co_return io_result<size_t>{{}, 0};
+}
+
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+
+io_task<std::string>
+io_success_string(std::string s)
+{
+    co_return io_result<std::string>{{}, std::move(s)};
+}
+
+io_task<size_t, int>
+io_success_size_int(size_t n, int flags)
+{
+    co_return io_result<size_t, int>{{}, n, flags};
+}
+
+// Suspends until stop token fires, then returns ECANCELED.
+io_task<size_t>
+io_pending_size()
+{
+    co_await stop_only_awaitable{};
+    co_return io_result<size_t>{make_error_code(error::canceled), 0};
+}
+
+} // anonymous namespace
+
+struct when_all_range_test
+{
+    void
+    testSingleElement()
+    {
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::string result;
 
-        // void_task() doesn't contribute to result tuple
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_success_size(42));
+
         run_async(ex,
-            [&](std::tuple<int, std::string> t) {
-                auto [a, b] = t;
+            [&](io_result<std::vector<size_t>> r) {
                 completed = true;
-                result = b + std::to_string(a);
+                BOOST_TEST(!r.ec);
+                BOOST_TEST_EQ(std::get<0>(r.values).size(), 1u);
+                BOOST_TEST_EQ(std::get<0>(r.values)[0], 42u);
             },
             [](std::exception_ptr) {})(
-            when_all(returns_int(42), returns_string("hello"), void_task()));
+            when_all(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, "hello42");
     }
 
-    // Test: Single task in when_all
     void
-    testSingleTask()
+    testMultipleElements()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int result = 0;
+
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_success_size(10));
+        tasks.push_back(io_success_size(20));
+        tasks.push_back(io_success_size(30));
 
         run_async(ex,
-            [&](std::tuple<int> t) {
-                auto [a] = t;
+            [&](io_result<std::vector<size_t>> r) {
                 completed = true;
-                result = a;
+                BOOST_TEST(!r.ec);
+                BOOST_TEST_EQ(std::get<0>(r.values).size(), 3u);
+                BOOST_TEST_EQ(std::get<0>(r.values)[0], 10u);
+                BOOST_TEST_EQ(std::get<0>(r.values)[1], 20u);
+                BOOST_TEST_EQ(std::get<0>(r.values)[2], 30u);
             },
             [](std::exception_ptr) {})(
-            when_all(returns_int(99)));
+            when_all(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 99);
     }
 
-    // Test: First exception captured
     void
-    testFirstException()
+    testEmptyRange()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        bool caught_exception = false;
-        std::string error_msg;
+        int dc = 0;
+        test_executor ex(dc);
+        bool caught = false;
+
+        std::vector<io_task<size_t>> tasks;
 
         run_async(ex,
-            [&](std::tuple<int, int>) { completed = true; },
+            [](io_result<std::vector<size_t>>) {},
             [&](std::exception_ptr ep) {
                 try {
                     std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
+                } catch (std::invalid_argument const&) {
+                    caught = true;
                 }
-            })(when_all(throws_exception("first error"), returns_int(10)));
+            })(when_all(std::move(tasks)));
 
-        BOOST_TEST(!completed);
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "first error");
+        BOOST_TEST(caught);
     }
 
-    // Test: Multiple failures - first exception wins
     void
-    testMultipleFailuresFirstWins()
+    testVoidRange()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+
+        std::vector<io_task<>> tasks;
+        tasks.push_back(io_void_ok());
+        tasks.push_back(io_void_ok());
+        tasks.push_back(io_void_ok());
 
         run_async(ex,
-            [](std::tuple<int, int, int>) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_all(
-                throws_exception("error_1"),
-                throws_exception("error_2"),
-                throws_exception("error_3")));
-
-        BOOST_TEST(caught_exception);
-        BOOST_TEST(
-            error_msg == "error_1" ||
-            error_msg == "error_2" ||
-            error_msg == "error_3");
+            [&](io_result<> r) {
+                completed = true;
+                BOOST_TEST(!r.ec);
+            },
+            [](std::exception_ptr) {})(
+            when_all(std::move(tasks)));
+
+        BOOST_TEST(completed);
     }
 
-    // Test: Void task throws exception
     void
-    testVoidTaskException()
+    testEmptyVoidRange()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
+        int dc = 0;
+        test_executor ex(dc);
+        bool caught = false;
+
+        std::vector<io_task<>> tasks;
 
         run_async(ex,
-            [](std::tuple<int>) {},
+            [](io_result<>) {},
             [&](std::exception_ptr ep) {
                 try {
                     std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
+                } catch (std::invalid_argument const&) {
+                    caught = true;
                 }
-            })(when_all(returns_int(10), void_throws_exception("void error")));
+            })(when_all(std::move(tasks)));
 
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "void error");
+        BOOST_TEST(caught);
     }
 
-    // Test: Nested when_all calls
     void
-    testNestedWhenAll()
+    testErrorCancelsSiblings()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int result = 0;
+        std::error_code result_ec;
 
-        // Helper tasks that use when_all internally
-        auto inner1 = []() -> task<int> {
-            auto [a, b] = co_await when_all(returns_int(1), returns_int(2));
-            co_return a + b;
-        };
-
-        auto inner2 = []() -> task<int> {
-            auto [a, b] = co_await when_all(returns_int(3), returns_int(4));
-            co_return a + b;
-        };
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_error_size(make_error_code(error::eof)));
+        tasks.push_back(io_pending_size());
 
         run_async(ex,
-            [&](std::tuple<int, int> t) {
-                auto [x, y] = t;
+            [&](io_result<std::vector<size_t>> r) {
                 completed = true;
-                result = x + y;
+                result_ec = r.ec;
             },
             [](std::exception_ptr) {})(
-            when_all(inner1(), inner2()));
+            when_all(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 10);  // (1+2) + (3+4) = 10
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    // Test: All void tasks return void (not empty tuple)
     void
-    testAllVoidTasks()
+    testMultipleErrorsFirstWins()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        std::error_code result_ec;
+
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_error_size(make_error_code(error::eof)));
+        tasks.push_back(io_error_size(make_error_code(error::timeout)));
 
-        // All void tasks return void, not std::tuple<>
         run_async(ex,
-            [&]() { completed = true; },
+            [&](io_result<std::vector<size_t>> r) {
+                completed = true;
+                result_ec = r.ec;
+            },
             [](std::exception_ptr) {})(
-            when_all(void_task(), void_task(), void_task()));
+            when_all(std::move(tasks)));
 
         BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    // Test: Result type correctness - void types filtered, all-void returns void
     void
-    testResultType()
+    testException()
     {
-        // Mixed types: void filtered out
-        using mixed_result = when_all_result_type<int, void, std::string>;
-        static_assert(std::is_same_v<
-            mixed_result,
-            std::tuple<int, std::string>>);
-
-        // All void: returns void (not empty tuple)
-        using all_void_result = when_all_result_type<void, void, void>;
-        static_assert(std::is_void_v<all_void_result>);
-
-        // Single void: returns void
-        using single_void_result = when_all_result_type<void>;
-        static_assert(std::is_void_v<single_void_result>);
-    }
+        int dc = 0;
+        test_executor ex(dc);
+        bool caught = false;
+        std::string msg;
 
-    //----------------------------------------------------------
-    // Stop token propagation tests
-    //----------------------------------------------------------
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_success_size(1));
+        tasks.push_back(io_throws_size("range error"));
+        tasks.push_back(io_success_size(3));
 
-    // Helper: task that records if stop was requested
-    static task<int>
-    checks_stop_token(std::atomic<bool>&)
+        run_async(ex,
+            [](io_result<std::vector<size_t>>) {},
+            [&](std::exception_ptr ep) {
+                try {
+                    std::rethrow_exception(ep);
+                } catch (test_exception const& e) {
+                    caught = true;
+                    msg = e.what();
+                }
+            })(when_all(std::move(tasks)));
+
+        BOOST_TEST(caught);
+        BOOST_TEST_EQ(msg, "range error");
+    }
+
+    void
+    testVoidRangeError()
     {
-        // This task just returns immediately, but in real usage
-        // you would check stop_token in a loop
-        co_return 42;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        std::error_code result_ec;
+
+        std::vector<io_task<>> tasks;
+        tasks.push_back(io_void_ok());
+        tasks.push_back(io_void_error(make_error_code(error::eof)));
+
+        run_async(ex,
+            [&](io_result<> r) {
+                completed = true;
+                result_ec = r.ec;
+            },
+            [](std::exception_ptr) {})(
+            when_all(std::move(tasks)));
+
+        BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    // Helper: stoppable task that honors stop requests
-    static task<int>
-    stoppable_task(std::atomic<int>& counter)
+    void
+    testVoidRangeException()
     {
-        ++counter;
-        co_return counter.load();
+        int dc = 0;
+        test_executor ex(dc);
+        bool caught = false;
+
+        std::vector<io_task<>> tasks;
+        tasks.push_back(io_void_ok());
+        tasks.push_back(io_void_throws("void range error"));
+
+        run_async(ex,
+            [](io_result<>) {},
+            [&](std::exception_ptr ep) {
+                try {
+                    std::rethrow_exception(ep);
+                } catch (test_exception const&) {
+                    caught = true;
+                }
+            })(when_all(std::move(tasks)));
+
+        BOOST_TEST(caught);
     }
 
-    // Test: Stop is requested when a sibling fails
     void
-    testStopRequestedOnError()
+    testExceptionBeatsError()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
+        int dc = 0;
+        test_executor ex(dc);
+        bool caught = false;
+
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_throws_size("exception wins"));
+        tasks.push_back(io_error_size(make_error_code(error::eof)));
 
         run_async(ex,
-            [](std::tuple<int, int>) {},
-            [&](std::exception_ptr) {
-                caught_exception = true;
-            })(when_all(throws_exception("error"), returns_int(10)));
+            [](io_result<std::vector<size_t>>) {},
+            [&](std::exception_ptr ep) {
+                try {
+                    std::rethrow_exception(ep);
+                } catch (test_exception const&) {
+                    caught = true;
+                }
+            })(when_all(std::move(tasks)));
 
-        // Exception should propagate - stop was requested internally
-        BOOST_TEST(caught_exception);
+        BOOST_TEST(caught);
     }
 
-    // Test: All tasks complete even after stop is requested
     void
-    testAllTasksCompleteAfterStop()
+    testAllTasksCompleteAfterError()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         std::atomic<int> completion_count{0};
-        bool caught_exception = false;
+        bool completed = false;
 
-        auto counting_task = [&]() -> task<int> {
+        auto counting_io = [&]() -> io_task<size_t> {
             ++completion_count;
-            co_return 1;
+            co_return io_result<size_t>{{}, 1};
         };
 
-        auto failing_task = [&]() -> task<int> {
+        auto failing_io = [&]() -> io_task<size_t> {
             ++completion_count;
-            throw_test_exception("fail");
-            co_return 0;
+            co_return io_result<size_t>{make_error_code(error::eof), 0};
         };
 
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(counting_io());
+        tasks.push_back(failing_io());
+        tasks.push_back(counting_io());
+
         run_async(ex,
-            [](std::tuple<int, int, int>) {},
-            [&](std::exception_ptr) {
-                caught_exception = true;
-            })(when_all(
-                counting_task(),
-                failing_task(),
-                counting_task()));
+            [&](io_result<std::vector<size_t>>) {
+                completed = true;
+            },
+            [](std::exception_ptr) {})(
+            when_all(std::move(tasks)));
 
-        BOOST_TEST(caught_exception);
-        // All three tasks should have run to completion
+        BOOST_TEST(completed);
         BOOST_TEST_EQ(completion_count.load(), 3);
     }
 
-    //----------------------------------------------------------
-    // Edge case tests
-    //----------------------------------------------------------
+    void
+    testErrorViaSuccessHandler()
+    {
+        int dc = 0;
+        test_executor ex(dc);
+        bool success_called = false;
+        bool error_called = false;
+
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_error_size(make_error_code(error::eof)));
+
+        run_async(ex,
+            [&](io_result<std::vector<size_t>> r) {
+                success_called = true;
+                BOOST_TEST(!!r.ec);
+            },
+            [&](std::exception_ptr) {
+                error_called = true;
+            })(when_all(std::move(tasks)));
+
+        BOOST_TEST(success_called);
+        BOOST_TEST(!error_called);
+    }
 
-    // Test: Large number of tasks
     void
-    testManyTasks()
+    testStringResults()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int result = 0;
+
+        std::vector<io_task<std::string>> tasks;
+        tasks.push_back(io_success_string("first"));
+        tasks.push_back(io_success_string("second"));
+        tasks.push_back(io_success_string("third"));
 
         run_async(ex,
-            [&](auto t) {
-                auto [a, b, c, d, e, f, g, h] = t;
+            [&](io_result<std::vector<std::string>> r) {
                 completed = true;
-                result = a + b + c + d + e + f + g + h;
+                BOOST_TEST(!r.ec);
+                BOOST_TEST_EQ(std::get<0>(r.values)[0], "first");
+                BOOST_TEST_EQ(std::get<0>(r.values)[1], "second");
+                BOOST_TEST_EQ(std::get<0>(r.values)[2], "third");
             },
-            [](std::exception_ptr) {})(when_all(
-                returns_int(1), returns_int(2), returns_int(3), returns_int(4),
-                returns_int(5), returns_int(6), returns_int(7), returns_int(8)));
+            [](std::exception_ptr) {})(
+            when_all(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 36);  // 1+2+3+4+5+6+7+8 = 36
-    }
-
-    // Test: Task that does multiple internal operations
-    static task<int>
-    multi_step_task(int start)
-    {
-        int value = start;
-        // Simulate multiple steps by nesting tasks
-        value += co_await returns_int(1);
-        value += co_await returns_int(2);
-        co_return value;
     }
 
     void
-    testTasksWithMultipleSteps()
+    testNestedRangeInVariadic()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int result = 0;
+
+        auto range_task = []() -> io_task<std::vector<size_t>> {
+            std::vector<io_task<size_t>> tasks;
+            tasks.push_back(io_success_size(1));
+            tasks.push_back(io_success_size(2));
+            tasks.push_back(io_success_size(3));
+            co_return co_await when_all(std::move(tasks));
+        };
+
+        auto io_size_task = []() -> io_task<size_t> {
+            co_return io_result<size_t>{{}, 99};
+        };
 
         run_async(ex,
-            [&](std::tuple<int, int> t) {
-                auto [a, b] = t;
+            [&](io_result<std::vector<size_t>, size_t> r) {
                 completed = true;
-                result = a + b;
+                BOOST_TEST(!r.ec);
+                BOOST_TEST_EQ(std::get<0>(r.values).size(), 3u);
+                BOOST_TEST_EQ(std::get<0>(r.values)[0] + std::get<0>(r.values)[1] + std::get<0>(r.values)[2], 6u);
+                BOOST_TEST_EQ(std::get<1>(r.values), 99u);
             },
             [](std::exception_ptr) {})(
-            when_all(multi_step_task(10), multi_step_task(20)));
+            when_all(range_task(), io_size_task()));
 
         BOOST_TEST(completed);
-        // (10+1+2) + (20+1+2) = 13 + 23 = 36
-        BOOST_TEST_EQ(result, 36);
     }
 
-    // Test: Different exception types - first wins
-    struct other_exception : std::runtime_error
+    void
+    testStrandRange()
     {
-        explicit other_exception(const char* msg)
-            : std::runtime_error(msg)
-        {
-        }
-    };
-
-    static task<int>
-    throws_other_exception(char const* msg)
+        thread_pool pool(2);
+        strand s{pool.get_executor()};
+        std::latch done(1);
+        bool completed = false;
+        size_t result = 0;
+
+        auto outer = [&]() -> task<io_result<std::vector<size_t>>> {
+            std::vector<io_task<size_t>> tasks;
+            tasks.push_back(io_success_size(10));
+            tasks.push_back(io_success_size(20));
+            co_return co_await when_all(std::move(tasks));
+        };
+
+        run_async(s,
+            [&](io_result<std::vector<size_t>> r) {
+                completed = true;
+                BOOST_TEST(!r.ec);
+                result = std::get<0>(r.values)[0] + std::get<0>(r.values)[1];
+                done.count_down();
+            },
+            [&](auto) {
+                done.count_down();
+            }
+        )(outer());
+
+        done.wait();
+        BOOST_TEST(completed);
+        BOOST_TEST_EQ(result, 30u);
+    }
+
+    void
+    run()
     {
-        throw other_exception(msg);
-        co_return 0;
+        testSingleElement();
+        testMultipleElements();
+        testEmptyRange();
+        testVoidRange();
+        testEmptyVoidRange();
+        testErrorCancelsSiblings();
+        testMultipleErrorsFirstWins();
+        testException();
+        testVoidRangeError();
+        testVoidRangeException();
+        testExceptionBeatsError();
+        testAllTasksCompleteAfterError();
+        testErrorViaSuccessHandler();
+        testStringResults();
+        testNestedRangeInVariadic();
+        testStrandRange();
     }
+};
+
+TEST_SUITE(
+    when_all_range_test,
+    "boost.capy.when_all_range");
 
+// Tests for io_result-aware when_all behavior per the combinators spec.
+// Each test is labelled with the spec row it verifies.
+struct when_all_io_result_test
+{
+    // Spec Row 1: All tasks return !ec
+    // Return tuple of all results. No cancellation.
     void
-    testDifferentExceptionTypes()
+    testAllSucceed()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_test = false;
-        bool caught_other = false;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        size_t n1 = 0, n2 = 0, n3 = 0;
 
         run_async(ex,
-            [](std::tuple<int, int>) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const&) {
-                    caught_test = true;
-                } catch (other_exception const&) {
-                    caught_other = true;
-                }
-            })(when_all(throws_exception("test"), throws_other_exception("other")));
+            [&](io_result<size_t, size_t, size_t> r) {
+                completed = true;
+                BOOST_TEST(!r.ec);
+                n1 = std::get<0>(r.values);
+                n2 = std::get<1>(r.values);
+                n3 = std::get<2>(r.values);
+            },
+            [](std::exception_ptr) {})(
+            when_all(
+                io_success_size(10),
+                io_success_size(20),
+                io_success_size(30)));
 
-        // One of them should be caught (first to fail wins)
-        BOOST_TEST(caught_test || caught_other);
-        // But not both
-        BOOST_TEST(!(caught_test && caught_other));
+        BOOST_TEST(completed);
+        BOOST_TEST_EQ(n1, 10u);
+        BOOST_TEST_EQ(n2, 20u);
+        BOOST_TEST_EQ(n3, 30u);
     }
 
-    //----------------------------------------------------------
-    // Executor propagation tests
-    //----------------------------------------------------------
-
-    // Executor that tracks which tasks were dispatched
-    struct tracking_executor
+    // Spec Row 1 (single child)
+    void
+    testSingleTaskSuccess()
     {
-        std::atomic<int>* dispatch_count_;
-        test_io_context* ctx_ = nullptr;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        size_t result = 0;
 
-        explicit tracking_executor(std::atomic<int>& count)
-            : dispatch_count_(&count)
-        {
-        }
+        run_async(ex,
+            [&](io_result<size_t> r) {
+                completed = true;
+                BOOST_TEST(!r.ec);
+                result = std::get<0>(r.values);
+            },
+            [](std::exception_ptr) {})(
+            when_all(io_success_size(42)));
 
-        bool operator==(tracking_executor const& other) const noexcept
-        {
-            return dispatch_count_ == other.dispatch_count_;
-        }
+        BOOST_TEST(completed);
+        BOOST_TEST_EQ(result, 42u);
+    }
 
-        test_io_context& context() const noexcept
-        {
-            return ctx_ ? *ctx_ : default_test_io_context();
-        }
+    // Spec Row 2: One task returns ec, others pending
+    // Cancel siblings. Propagate error.
+    void
+    testOneErrorCancelsSiblings()
+    {
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        std::error_code result_ec;
 
-        void on_work_started() const noexcept {}
-        void on_work_finished() const noexcept {}
+        run_async(ex,
+            [&](io_result<size_t, size_t> r) {
+                completed = true;
+                result_ec = r.ec;
+            },
+            [](std::exception_ptr) {})(
+            when_all(
+                io_error_size(make_error_code(error::eof)),
+                io_pending_size()));
 
-    std::coroutine_handle<> dispatch(std::coroutine_handle<> h) const
-    {
-        ++(*dispatch_count_);
-        return h;
+        BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    void post(std::coroutine_handle<> h) const
+    // Spec Row 3: Multiple tasks return ec concurrently
+    // Each triggers stop (idempotent). First ec wins.
+    void
+    testMultipleErrorsFirstWins()
     {
-        h.resume();
-    }
-    };
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        std::error_code result_ec;
+
+        run_async(ex,
+            [&](io_result<size_t, size_t> r) {
+                completed = true;
+                result_ec = r.ec;
+            },
+            [](std::exception_ptr) {})(
+            when_all(
+                io_error_size(make_error_code(error::eof)),
+                io_error_size(make_error_code(error::timeout))));
 
-    static_assert(Executor<tracking_executor>);
+        BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
+    }
 
+    // Spec Row 4: ec == eof, n == 0
+    // Error. Cancel siblings.
     void
-    testDispatcherUsedForAllTasks()
+    testEofWithZeroBytes()
     {
-        std::atomic<int> dispatch_count{0};
-        tracking_executor tex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        std::error_code result_ec;
 
-        run_async(tex,
-            [&](std::tuple<int, int, int> t) {
-                auto [a, b, c] = t;
+        run_async(ex,
+            [&](io_result<size_t, size_t> r) {
                 completed = true;
-                BOOST_TEST_EQ(a + b + c, 6);
+                result_ec = r.ec;
             },
             [](std::exception_ptr) {})(
-            when_all(returns_int(1), returns_int(2), returns_int(3)));
+            when_all(
+                io_error_size(make_error_code(error::eof), 0),
+                io_pending_size()));
 
         BOOST_TEST(completed);
-        // Dispatcher should be called for:
-        // - run_async initial dispatch
-        // - when_all runners (3)
-        // - signal_completion resumption
-        BOOST_TEST(dispatch_count.load() > 0);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    //----------------------------------------------------------
-    // Result ordering tests
-    //----------------------------------------------------------
-
-    // Test: Results are in input order regardless of completion order
+    // Spec Row 5: ec != 0, n > 0 (partial transfer)
+    // Error. Cancel siblings. Values stored as-is.
     void
-    testResultsInInputOrder()
+    testPartialTransferIsError()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        std::error_code result_ec;
+        size_t partial = 0;
 
         run_async(ex,
-            [&](std::tuple<std::string, std::string, std::string> t) {
-                auto [first, second, third] = t;
-                BOOST_TEST_EQ(first, "first");
-                BOOST_TEST_EQ(second, "second");
-                BOOST_TEST_EQ(third, "third");
+            [&](io_result<size_t> r) {
                 completed = true;
+                result_ec = r.ec;
+                partial = std::get<0>(r.values);
             },
-            [](std::exception_ptr) {})(when_all(
-                returns_string("first"),
-                returns_string("second"),
-                returns_string("third")));
+            [](std::exception_ptr) {})(
+            when_all(
+                io_error_size(make_error_code(error::eof), 42)));
 
         BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
+        BOOST_TEST_EQ(partial, 42u);
     }
 
-    // Test: Mixed void and value results maintain order
+    // Spec Row 5 (with sibling)
     void
-    testMixedVoidValueOrder()
+    testPartialTransferValuePreserved()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        std::error_code result_ec;
+        size_t n1 = 0;
 
-        // void at index 1, values at 0 and 2
         run_async(ex,
-            [&](std::tuple<int, int> t) {
-                // a should be from index 0, b from index 2
-                auto [a, b] = t;
-                BOOST_TEST_EQ(a, 100);
-                BOOST_TEST_EQ(b, 300);
+            [&](io_result<size_t, size_t> r) {
                 completed = true;
+                result_ec = r.ec;
+                n1 = std::get<0>(r.values);
             },
             [](std::exception_ptr) {})(
-            when_all(returns_int(100), void_task(), returns_int(300)));
+            when_all(
+                io_error_size(make_error_code(error::eof), 42),
+                io_pending_size()));
 
         BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
+        BOOST_TEST_EQ(n1, 42u);
     }
 
-    //----------------------------------------------------------
-    // Awaitable lifecycle tests
-    //----------------------------------------------------------
-
-    // Test: when_all_awaitable is move constructible
+    // Spec Row 6: Zero-length buffer, ({}, 0)
+    // Success. No cancellation.
     void
-    testAwaitableMoveConstruction()
+    testZeroTransferSuccess()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
 
-        auto awaitable1 = when_all(returns_int(1), returns_int(2));
-        auto awaitable2 = std::move(awaitable1);
-
         run_async(ex,
-            [&](std::tuple<int, int> t) {
-                auto [a, b] = t;
+            [&](io_result<size_t> r) {
                 completed = true;
-                BOOST_TEST_EQ(a + b, 3);
+                BOOST_TEST(!r.ec);
+                BOOST_TEST_EQ(std::get<0>(r.values), 0u);
             },
-            [](std::exception_ptr) {})(std::move(awaitable2));
+            [](std::exception_ptr) {})(
+            when_all(io_success_size(0)));
 
         BOOST_TEST(completed);
     }
 
-    // Test: when_all can be stored and awaited later
+    // Spec Row 7: Zero-length buffer, (ec, 0)
+    // Error (ec reflects stream state). Cancel siblings.
     void
-    testDeferredAwait()
+    testZeroTransferError()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        std::error_code result_ec;
 
-        auto deferred = when_all(returns_int(10), returns_int(20));
-        // Await later
         run_async(ex,
-            [&](std::tuple<int, int> t) {
-                auto [a, b] = t;
+            [&](io_result<size_t> r) {
                 completed = true;
-                BOOST_TEST_EQ(a + b, 30);
+                result_ec = r.ec;
             },
-            [](std::exception_ptr) {})(std::move(deferred));
+            [](std::exception_ptr) {})(
+            when_all(
+                io_error_size(make_error_code(error::eof), 0)));
 
         BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    //----------------------------------------------------------
-    // Stoppable awaitable protocol tests
-    //----------------------------------------------------------
-
-    // Test: when_all returns task which satisfies IoAwaitable concept
+    // Spec Row 8: One task throws
+    // Capture exception. Cancel siblings. Rethrow after all complete.
     void
-    testIoAwaitableConcept()
+    testOneThrows()
     {
-        // when_all now returns task<T>, which satisfies the awaitable protocols
-        static_assert(IoAwaitable<
-            task<std::tuple<int, int>>>);
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        bool caught = false;
+        std::string msg;
 
-        static_assert(IoAwaitable<
-            task<std::tuple<int, std::string>>>);
+        run_async(ex,
+            [&](io_result<size_t, size_t>) { completed = true; },
+            [&](std::exception_ptr ep) {
+                try { std::rethrow_exception(ep); }
+                catch (test_exception const& e) {
+                    caught = true;
+                    msg = e.what();
+                }
+            })(when_all(
+                io_throws_size("boom"),
+                io_pending_size()));
 
-        static_assert(IoAwaitable<
-            task<void>>);
+        BOOST_TEST(!completed);
+        BOOST_TEST(caught);
+        BOOST_TEST_EQ(msg, "boom");
     }
 
-    // Test: Nested when_all propagates stop
+    // Spec Row 9: Multiple tasks throw
+    // First exception captured. Others discarded. Rethrow first.
     void
-    testNestedWhenAllStopPropagation()
+    testMultipleThrowsFirstWins()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-
-        auto inner_failing = []() -> task<int> {
-            auto [a, b] = co_await when_all(
-                throws_exception("inner error"),
-                returns_int(1)
-            );
-            co_return a + b;
-        };
-
-        auto inner_success = []() -> task<int> {
-            auto [a, b] = co_await when_all(
-                returns_int(2),
-                returns_int(3)
-            );
-            co_return a + b;
-        };
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        bool caught = false;
+        std::string msg;
 
         run_async(ex,
-            [](std::tuple<int, int>) {},
+            [&](io_result<size_t, size_t>) { completed = true; },
             [&](std::exception_ptr ep) {
-                caught_exception = true;
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    BOOST_TEST_EQ(std::string(e.what()), "inner error");
+                try { std::rethrow_exception(ep); }
+                catch (test_exception const& e) {
+                    caught = true;
+                    msg = e.what();
                 }
-            })(when_all(inner_failing(), inner_success()));
+            })(when_all(
+                io_throws_size("first"),
+                io_throws_size("second")));
 
-        BOOST_TEST(caught_exception);
+        BOOST_TEST(!completed);
+        BOOST_TEST(caught);
+        BOOST_TEST_EQ(msg, "first");
     }
 
+    // Spec Row 10: One throws, another returns ec (either order)
+    // Exception always wins.
     void
-    run()
+    testExceptionBeatsError()
     {
-        // Basic functionality
-        testResultType();
-        testAllSucceed();
-        testThreeTasksSucceed();
-        testMixedTypes();
-        testSingleTask();
-        testFirstException();
-        testMultipleFailuresFirstWins();
-        testVoidTaskException();
-        testNestedWhenAll();
-        testAllVoidTasks();
-
-        // Stop token propagation
-        testStopRequestedOnError();
-        testAllTasksCompleteAfterStop();
-
-        // Edge cases
-        testManyTasks();
-        testTasksWithMultipleSteps();
-        testDifferentExceptionTypes();
-
-        // Dispatcher propagation
-        testDispatcherUsedForAllTasks();
-
-        // Result ordering
-        testResultsInInputOrder();
-        testMixedVoidValueOrder();
-
-        // Awaitable lifecycle
-        testAwaitableMoveConstruction();
-        testDeferredAwait();
-
-        // Stoppable awaitable protocol
-        testIoAwaitableConcept();
-        testNestedWhenAllStopPropagation();
-
-        // Frame allocator tests - skipped: allocator is currently ignored per design
-        // testWhenAllUsesAllocator();
-        // testNestedWhenAllUsesAllocator();
-    }
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        bool caught = false;
+        std::string msg;
 
-    //----------------------------------------------------------
-    // Frame allocator tests
-    //----------------------------------------------------------
+        run_async(ex,
+            [&](io_result<size_t, size_t>) { completed = true; },
+            [&](std::exception_ptr ep) {
+                try { std::rethrow_exception(ep); }
+                catch (test_exception const& e) {
+                    caught = true;
+                    msg = e.what();
+                }
+            })(when_all(
+                io_throws_size("exception wins"),
+                io_error_size(make_error_code(error::eof))));
 
-    /** Tracking frame allocator that logs allocation events.
-    */
-    template<class T = std::byte>
-    struct tracking_frame_allocator
-    {
-        using value_type = T;
-
-        template<class U>
-        struct rebind { using other = tracking_frame_allocator<U>; };
-
-        int id;
-        int* alloc_count;
-        int* dealloc_count;
-        std::vector<int>* alloc_log;
-
-        tracking_frame_allocator(int id_, int* ac, int* dc, std::vector<int>* log)
-            : id(id_), alloc_count(ac), dealloc_count(dc), alloc_log(log) {}
-
-        template<class U>
-        tracking_frame_allocator(const tracking_frame_allocator<U>& o)
-            : id(o.id), alloc_count(o.alloc_count), dealloc_count(o.dealloc_count), alloc_log(o.alloc_log) {}
-
-        T* allocate(std::size_t n)
-        {
-            ++(*alloc_count);
-            if(alloc_log)
-                alloc_log->push_back(id);
-            return static_cast<T*>(::operator new(n * sizeof(T)));
-        }
-
-        void deallocate(T* p, std::size_t)
-        {
-            ++(*dealloc_count);
-            ::operator delete(p);
-        }
-    };
+        BOOST_TEST(!completed);
+        BOOST_TEST(caught);
+        BOOST_TEST_EQ(msg, "exception wins");
+    }
 
+    // Spec Row 10 (reversed): error first, then throw
+    // Exception still wins.
     void
-    testWhenAllUsesAllocator()
+    testExceptionBeatsErrorReversed()
     {
-        // Verify that when_all() coroutines use the custom allocator
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        bool caught = false;
 
-        int alloc_count = 0;
-        int dealloc_count = 0;
-        std::vector<int> alloc_log;
+        run_async(ex,
+            [&](io_result<size_t, size_t>) { completed = true; },
+            [&](std::exception_ptr ep) {
+                try { std::rethrow_exception(ep); }
+                catch (test_exception const&) { caught = true; }
+            })(when_all(
+                io_error_size(make_error_code(error::eof)),
+                io_throws_size("exception")));
+
+        BOOST_TEST(!completed);
+        BOOST_TEST(caught);
+    }
 
-        tracking_frame_allocator<> alloc{1, &alloc_count, &dealloc_count, &alloc_log};
+    // Spec Row 11: Parent stop token fires
+    // Not a special case. Children return ECANCELED,
+    // which is an error like any other. First ec wins.
+    void
+    testCanceledIsNormalError()
+    {
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        std::error_code result_ec;
 
-        run_async(ex, std::stop_token{}, alloc,
-            [&](std::tuple<int, int, int> t) {
-                auto [a, b, c] = t;
+        run_async(ex,
+            [&](io_result<size_t, size_t> r) {
                 completed = true;
-                BOOST_TEST_EQ(a + b + c, 60);
+                result_ec = r.ec;
             },
             [](std::exception_ptr) {})(
-            when_all(returns_int(10), returns_int(20), returns_int(30)));
+            when_all(
+                io_error_size(make_error_code(error::canceled)),
+                io_error_size(make_error_code(error::canceled))));
 
         BOOST_TEST(completed);
-        // when_all should have allocated frames through our allocator
-        BOOST_TEST_GE(alloc_count, 1);
-        // All allocations should use our allocator
-        for(int id : alloc_log)
-            BOOST_TEST_EQ(id, 1);
-        // All allocations should be deallocated
-        BOOST_TEST_EQ(alloc_count, dealloc_count);
+        BOOST_TEST(result_ec == cond::canceled);
     }
 
+    // Spec Row 12: All tasks fail
+    // Propagate single error_code (first wins). Not a tuple of failures.
     void
-    testNestedWhenAllUsesAllocator()
+    testAllFail()
     {
-        // Verify nested when_all calls also use the allocator
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        std::error_code result_ec;
 
-        int alloc_count = 0;
-        int dealloc_count = 0;
-        std::vector<int> alloc_log;
-
-        tracking_frame_allocator<> alloc{1, &alloc_count, &dealloc_count, &alloc_log};
-
-        auto inner1 = []() -> task<int> {
-            auto [a, b] = co_await when_all(returns_int(1), returns_int(2));
-            co_return a + b;
-        };
-
-        auto inner2 = []() -> task<int> {
-            auto [a, b] = co_await when_all(returns_int(3), returns_int(4));
-            co_return a + b;
-        };
-
-        int result = 0;
-        run_async(ex, std::stop_token{}, alloc,
-            [&](std::tuple<int, int> t) {
-                auto [x, y] = t;
+        run_async(ex,
+            [&](io_result<size_t, size_t, size_t> r) {
                 completed = true;
-                result = x + y;
+                result_ec = r.ec;
             },
             [](std::exception_ptr) {})(
-            when_all(inner1(), inner2()));
+            when_all(
+                io_error_size(make_error_code(error::eof)),
+                io_error_size(make_error_code(error::timeout)),
+                io_error_size(make_error_code(error::canceled))));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 10);  // (1+2) + (3+4) = 10
-        // Nested when_all should also allocate through our allocator
-        BOOST_TEST_GE(alloc_count, 1);
-        // All allocations should use our allocator
-        for(int id : alloc_log)
-            BOOST_TEST_EQ(id, 1);
-        // All allocations should be deallocated
-        BOOST_TEST_EQ(alloc_count, dealloc_count);
+        BOOST_TEST(result_ec == cond::eof);
     }
-};
 
-TEST_SUITE(
-    when_all_test,
-    "boost.capy.when_all");
-
-//----------------------------------------------------------
-// IoAwaitable (non-task) tests for when_all
-//----------------------------------------------------------
-
-struct when_all_io_awaitable_test
-{
-    // Test: when_all with stop_only_awaitable and task<int>
-    // stop_only_awaitable only completes via cancellation, so we
-    // provide a parent stop_source to trigger both completions.
+    // Spec Row 13: Failure reaches caller via io_result's ec
+    // Error goes through success handler, not exception handler.
     void
-    testStopOnlyAwaitableWithTask()
+    testErrorViaSuccessHandler()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-        bool completed = false;
-        int result = 0;
+        int dc = 0;
+        test_executor ex(dc);
+        bool success_called = false;
+        bool error_called = false;
 
-        std::stop_source parent_stop;
-
-        run_async(ex, parent_stop.get_token(),
-            [&](std::tuple<int> t) {
-                completed = true;
-                result = std::get<0>(t);
+        run_async(ex,
+            [&](io_result<size_t> r) {
+                success_called = true;
+                BOOST_TEST(!!r.ec);
             },
-            [](std::exception_ptr) {})(
-            when_all(stop_only_awaitable{}, returns_int(42)));
-
-        // stop_only_awaitable needs cancellation to complete
-        parent_stop.request_stop();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            [&](std::exception_ptr) {
+                error_called = true;
+            })(when_all(
+                io_error_size(make_error_code(error::eof))));
 
-        BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 42);
+        BOOST_TEST(success_called);
+        BOOST_TEST(!error_called);
     }
 
-    // Test: when_all with async_event wait_awaiter and task<int>
+    // Spec Row 14 (mixed value types)
     void
-    testAsyncEventWaitWithTask()
+    testMixedValueTypes()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int result = 0;
+        size_t n = 0;
+        std::string s;
 
-        async_event event;
-
-        // event.wait() returns io_result<>, returns_int returns int
-        // Result: tuple<io_result<>, int>
         run_async(ex,
-            [&](auto&& t) {
+            [&](io_result<size_t, std::string> r) {
                 completed = true;
-                result = std::get<1>(t);
+                BOOST_TEST(!r.ec);
+                n = std::get<0>(r.values);
+                s = std::get<1>(r.values);
             },
             [](std::exception_ptr) {})(
-            when_all(event.wait(), returns_int(42)));
-
-        // Set event so the waiter can complete
-        event.set();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_all(
+                io_success_size(42),
+                io_success_string("hello")));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(result, 42);
+        BOOST_TEST_EQ(n, 42u);
+        BOOST_TEST_EQ(s, "hello");
     }
 
-    // Test: when_all with two stop_only_awaitables
+    // Spec Row 14 (multi-value child: io_result<T1, T2> contributes tuple<T1, T2>)
     void
-    testTwoStopOnlyAwaitables()
+    testMultiValueChild()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        size_t n = 0;
+        std::tuple<size_t, int> tf;
 
-        std::stop_source parent_stop;
-
-        run_async(ex, parent_stop.get_token(),
-            [&]() {
+        run_async(ex,
+            [&](io_result<size_t, std::tuple<size_t, int>> r) {
                 completed = true;
+                BOOST_TEST(!r.ec);
+                n = std::get<0>(r.values);
+                tf = std::get<1>(r.values);
             },
             [](std::exception_ptr) {})(
-            when_all(stop_only_awaitable{}, stop_only_awaitable{}));
-
-        // Neither can complete on their own - request parent stop
-        parent_stop.request_stop();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_all(
+                io_success_size(42),
+                io_success_size_int(10, 7)));
 
         BOOST_TEST(completed);
+        BOOST_TEST_EQ(n, 42u);
+        BOOST_TEST_EQ(std::get<0>(tf), 10u);
+        BOOST_TEST_EQ(std::get<1>(tf), 7);
     }
 
-    // Test: when_all with io_task<> types
+    // Spec Row 14 (void results: io_result<> contributes tuple<>)
     void
-    testIoTaskWithWhenAll()
+    testVoidResults()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
+        size_t n = 0;
 
-        auto io_op = []() -> io_task<> {
-            co_return io_result<>{{}};
-        };
-
-        // io_task<> is task<io_result<>>, result: tuple<io_result<>, io_result<>>
         run_async(ex,
-            [&](auto&&) {
+            [&](io_result<size_t, std::tuple<>> r) {
                 completed = true;
+                BOOST_TEST(!r.ec);
+                n = std::get<0>(r.values);
             },
             [](std::exception_ptr) {})(
-            when_all(io_op(), io_op()));
+            when_all(
+                io_success_size(42),
+                io_void_ok()));
 
         BOOST_TEST(completed);
+        BOOST_TEST_EQ(n, 42u);
     }
 
-    // Test: when_all with mixed io_task and regular task
+    // First error in time wins, not first in tuple order.
+    // Child 0 (pending) gets cancelled after child 1 fails with eof.
+    // The outer ec must be eof, not canceled.
     void
-    testMixedIoTaskAndRegularTask()
+    testFirstErrorInTimeWins()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int int_result = 0;
-
-        auto io_read = [](std::size_t n) -> io_task<std::size_t> {
-            co_return io_result<std::size_t>{{}, n};
-        };
+        std::error_code result_ec;
 
         run_async(ex,
-            [&](auto&& t) {
+            [&](io_result<size_t, size_t> r) {
                 completed = true;
-                int_result = std::get<0>(t);
+                result_ec = r.ec;
             },
             [](std::exception_ptr) {})(
-            when_all(returns_int(99), io_read(200)));
+            when_all(
+                io_pending_size(),
+                io_error_size(make_error_code(error::eof))));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(int_result, 99);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
     void
     run()
     {
-        testStopOnlyAwaitableWithTask();
-        testAsyncEventWaitWithTask();
-        testTwoStopOnlyAwaitables();
-        testIoTaskWithWhenAll();
-        testMixedIoTaskAndRegularTask();
+        testAllSucceed();
+        testSingleTaskSuccess();
+        testOneErrorCancelsSiblings();
+        testMultipleErrorsFirstWins();
+        testEofWithZeroBytes();
+        testPartialTransferIsError();
+        testPartialTransferValuePreserved();
+        testZeroTransferSuccess();
+        testZeroTransferError();
+        testOneThrows();
+        testMultipleThrowsFirstWins();
+        testExceptionBeatsError();
+        testExceptionBeatsErrorReversed();
+        testCanceledIsNormalError();
+        testAllFail();
+        testErrorViaSuccessHandler();
+        testMixedValueTypes();
+        testMultiValueChild();
+        testVoidResults();
+        testFirstErrorInTimeWins();
     }
 };
 
 TEST_SUITE(
-    when_all_io_awaitable_test,
-    "boost.capy.when_all_io_awaitable");
+    when_all_io_result_test,
+    "boost.capy.when_all_io_result");
 
 } // capy
 } // boost
diff --git a/test/unit/when_any.cpp b/test/unit/when_any.cpp
index 7915da911..48e0b6133 100644
--- a/test/unit/when_any.cpp
+++ b/test/unit/when_any.cpp
@@ -1,5 +1,6 @@
 //
 // Copyright (c) 2026 Michael Vandeberg
+// Copyright (c) 2026 Steve Gerbino
 //
 // Distributed under the Boost Software License, Version 1.0. (See accompanying
 // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
@@ -10,13 +11,12 @@
 // Test that header file is self-contained.
 #include <boost/capy/when_any.hpp>
 
-#include <boost/capy/ex/async_event.hpp>
-#include <boost/capy/ex/execution_context.hpp>
+#include <boost/capy/cond.hpp>
+#include <boost/capy/error.hpp>
 #include <boost/capy/ex/run_async.hpp>
 #include <boost/capy/ex/this_coro.hpp>
 #include <boost/capy/io_task.hpp>
 #include <boost/capy/task.hpp>
-#include <boost/capy/when_all.hpp>
 
 #include "test_helpers.hpp"
 #include "test_suite.hpp"
@@ -25,1486 +25,378 @@
 #include <queue>
 #include <stdexcept>
 #include <string>
+#include <system_error>
 #include <type_traits>
+#include <variant>
 
 namespace boost {
 namespace capy {
 
-struct when_any_test
-{
-    //----------------------------------------------------------
-    // Basic functionality tests
-    //----------------------------------------------------------
-
-    // Test: Single task returns immediately
-    void
-    testSingleTask()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        int result = 0;
-        std::size_t winner_index = 999;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result = std::get<0>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(returns_int(42)));
-
-        BOOST_TEST(completed);
-        BOOST_TEST_EQ(winner_index, 0u);
-        BOOST_TEST_EQ(result, 42);
-    }
-
-    // Test: Two tasks - first completes wins
-    void
-    testTwoTasksFirstWins()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                // Variant is deduplicated to single int type
-                result_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(returns_int(10), returns_int(20)));
-
-        BOOST_TEST(completed);
-        // One of them should win, with correct index-to-value mapping
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        if (winner_index == 0)
-            BOOST_TEST_EQ(result_value, 10);
-        else
-            BOOST_TEST_EQ(result_value, 20);
-    }
-
-    // Test: Three tasks with different types
-    void
-    testMixedTypes()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        std::variant<int, std::string> result_value;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = r.second;
-            },
-            [](std::exception_ptr) {})(
-            when_any(returns_int(1), returns_string("hello"), returns_int(3)));
-
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1 || winner_index == 2);
-        if (winner_index == 0)
-            BOOST_TEST_EQ(std::get<int>(result_value), 1);
-        else if (winner_index == 1)
-            BOOST_TEST_EQ(std::get<std::string>(result_value), "hello");
-        else
-            BOOST_TEST_EQ(std::get<int>(result_value), 3);
-    }
-
-    // Test: Void task can win
-    void
-    testVoidTaskWins()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        std::variant<std::monostate, int> result_value;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = r.second;
-            },
-            [](std::exception_ptr) {})(
-            when_any(void_task(), returns_int(42)));
-
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        if (winner_index == 0)
-            BOOST_TEST(std::holds_alternative<std::monostate>(result_value));
-        else
-            BOOST_TEST_EQ(std::get<int>(result_value), 42);
-    }
-
-    // Test: All void tasks
-    void
-    testAllVoidTasks()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        std::variant<std::monostate> result_value;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = r.second;
-            },
-            [](std::exception_ptr) {})(
-            when_any(void_task(), void_task(), void_task()));
-
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1 || winner_index == 2);
-        // All void tasks produce monostate regardless of index
-        BOOST_TEST(std::holds_alternative<std::monostate>(result_value));
-    }
-
-    //----------------------------------------------------------
-    // Exception handling tests
-    //----------------------------------------------------------
-
-    // Test: Exception from single task propagates
-    void
-    testSingleTaskException()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        bool caught_exception = false;
-        std::string error_msg;
-
-        run_async(ex,
-            [&](auto&&) { completed = true; },
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(throws_exception("test error")));
-
-        BOOST_TEST(!completed);
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "test error");
-    }
-
-    // Test: Exception wins the race (exception is a valid completion)
-    void
-    testExceptionWinsRace()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
-
-        run_async(ex,
-            [](auto&&) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(throws_exception("winner error"), returns_int(42)));
-
-        // With synchronous executor, first task (the thrower) wins
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "winner error");
-    }
-
-    // Test: Void task exception
-    void
-    testVoidTaskException()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
-
-        run_async(ex,
-            [](auto&&) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(void_throws_exception("void error"), returns_int(42)));
-
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "void error");
-    }
-
-    // Test: Multiple exceptions - first wins
-    void
-    testMultipleExceptionsFirstWins()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
-
-        run_async(ex,
-            [](auto&&) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(
-                throws_exception("error_1"),
-                throws_exception("error_2"),
-                throws_exception("error_3")));
-
-        BOOST_TEST(caught_exception);
-        // One of them wins
-        BOOST_TEST(
-            error_msg == "error_1" ||
-            error_msg == "error_2" ||
-            error_msg == "error_3");
-    }
-
-    //----------------------------------------------------------
-    // Stop token propagation tests
-    //----------------------------------------------------------
-
-    // Test: Stop is requested when winner completes
-    void
-    testStopRequestedOnCompletion()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        std::atomic<int> completion_count{0};
-        bool completed = false;
-
-        auto counting_task = [&]() -> task<int> {
-            ++completion_count;
-            co_return completion_count.load();
-        };
-
-        run_async(ex,
-            [&](auto&&) {
-                completed = true;
-            },
-            [](std::exception_ptr) {})(
-            when_any(counting_task(), counting_task(), counting_task()));
-
-        BOOST_TEST(completed);
-        // All three tasks should run to completion
-        // (stop is requested, but synchronous tasks complete anyway)
-        BOOST_TEST_EQ(completion_count.load(), 3);
-    }
-
-    // Test: All tasks complete even after winner (cleanup)
-    void
-    testAllTasksCompleteForCleanup()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        std::atomic<int> completion_count{0};
-        bool completed = false;
-
-        auto counting_task = [&](int value) -> task<int> {
-            ++completion_count;
-            co_return value;
-        };
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                // Winner should be first task (synchronous executor)
-                BOOST_TEST_EQ(r.first, 0u);
-            },
-            [](std::exception_ptr) {})(
-            when_any(
-                counting_task(1),
-                counting_task(2),
-                counting_task(3),
-                counting_task(4)));
-
-        BOOST_TEST(completed);
-        // All four tasks must complete for proper cleanup
-        BOOST_TEST_EQ(completion_count.load(), 4);
-    }
-
-    //----------------------------------------------------------
-    // Long-lived task cancellation tests
-    //----------------------------------------------------------
-
-    // Test: Long-lived tasks exit early when stop is requested
-    void
-    testLongLivedTasksCancelledOnWinner()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> cancelled_count{0};
-        std::atomic<int> completed_normally_count{0};
-        bool when_any_completed = false;
-        std::size_t winner_index = 999;
-        int winner_value = 0;
-
-        // A task that completes immediately
-        auto fast_task = [&]() -> task<int> {
-            ++completed_normally_count;
-            co_return 42;
-        };
-
-        // A task that does multiple steps, checking stop token between each
-        auto slow_task = [&](int id, int steps) -> task<int> {
-            for (int i = 0; i < steps; ++i) {
-                auto token = (co_await this_coro::environment)->stop_token;
-                if (token.stop_requested()) {
-                    ++cancelled_count;
-                    co_return -1;  // Cancelled
-                }
-                co_await yield_awaitable{};
-            }
-            ++completed_normally_count;
-            co_return id;
-        };
-
-        run_async(ex,
-            [&](auto&& r) {
-                when_any_completed = true;
-                winner_index = r.first;
-                winner_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(fast_task(), slow_task(100, 10), slow_task(200, 10)));
-
-        // Process work queue until empty
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-        BOOST_TEST_EQ(winner_index, 0u);  // fast_task wins
-        BOOST_TEST_EQ(winner_value, 42);
-
-        // The fast task completed normally
-        BOOST_TEST_EQ(completed_normally_count.load(), 1);
-
-        // Both slow tasks should have been cancelled
-        BOOST_TEST_EQ(cancelled_count.load(), 2);
-    }
-
-    // Test: Slow task can win if it finishes first
-    void
-    testSlowTaskCanWin()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> cancelled_count{0};
-        std::atomic<int> completed_normally_count{0};
-        bool when_any_completed = false;
-        std::size_t winner_index = 999;
-        int winner_value = 0;
-
-        // A task that does a few steps then completes
-        auto medium_task = [&](int id, int steps) -> task<int> {
-            for (int i = 0; i < steps; ++i) {
-                auto token = (co_await this_coro::environment)->stop_token;
-                if (token.stop_requested()) {
-                    ++cancelled_count;
-                    co_return -1;
-                }
-                co_await yield_awaitable{};
-            }
-            ++completed_normally_count;
-            co_return id;
-        };
-
-        // Task 0: 3 steps, Task 1: 1 step (wins), Task 2: 4 steps
-        // With FIFO scheduling, task1 completes after 1 yield while others
-        // are still in progress and will observe the stop request.
-        run_async(ex,
-            [&](auto&& r) {
-                when_any_completed = true;
-                winner_index = r.first;
-                winner_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(medium_task(10, 3), medium_task(20, 1), medium_task(30, 4)));
-
-        // Process work queue until empty
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-        BOOST_TEST_EQ(winner_index, 1u);  // Task with 1 step wins
-        BOOST_TEST_EQ(winner_value, 20);
-
-        // Only the winner completed normally
-        BOOST_TEST_EQ(completed_normally_count.load(), 1);
-
-        // Other two tasks were cancelled
-        BOOST_TEST_EQ(cancelled_count.load(), 2);
-    }
-
-    // Test: Tasks that don't check stop token still complete (cleanup)
-    void
-    testNonCooperativeTasksStillComplete()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> completion_count{0};
-        bool when_any_completed = false;
-
-        // A task that completes immediately
-        auto fast_task = [&]() -> task<int> {
-            ++completion_count;
-            co_return 42;
-        };
-
-        // A task that ignores stop token (non-cooperative)
-        auto non_cooperative_task = [&](int id, int steps) -> task<int> {
-            for (int i = 0; i < steps; ++i) {
-                // Deliberately NOT checking stop token
-                co_await yield_awaitable{};
-            }
-            ++completion_count;
-            co_return id;
-        };
-
-        run_async(ex,
-            [&](auto&& r) {
-                when_any_completed = true;
-                BOOST_TEST_EQ(r.first, 0u);  // fast_task wins
-            },
-            [](std::exception_ptr) {})(
-            when_any(fast_task(), non_cooperative_task(100, 3), non_cooperative_task(200, 3)));
-
-        // Process work queue until empty
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-
-        // All three tasks complete (non-cooperative tasks run to completion)
-        BOOST_TEST_EQ(completion_count.load(), 3);
-    }
-
-    // Test: Mixed cooperative and non-cooperative tasks
-    void
-    testMixedCooperativeAndNonCooperativeTasks()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> cooperative_cancelled{0};
-        std::atomic<int> non_cooperative_finished{0};
-        std::atomic<int> winner_finished{0};
-        bool when_any_completed = false;
-
-        auto fast_task = [&]() -> task<int> {
-            ++winner_finished;
-            co_return 1;
-        };
-
-        auto cooperative_slow = [&](int steps) -> task<int> {
-            for (int i = 0; i < steps; ++i) {
-                auto token = (co_await this_coro::environment)->stop_token;
-                if (token.stop_requested()) {
-                    ++cooperative_cancelled;
-                    co_return -1;
-                }
-                co_await yield_awaitable{};
-            }
-            co_return 2;
-        };
-
-        auto non_cooperative_slow = [&](int steps) -> task<int> {
-            for (int i = 0; i < steps; ++i) {
-                co_await yield_awaitable{};
-            }
-            ++non_cooperative_finished;
-            co_return 3;
-        };
-
-        run_async(ex,
-            [&](auto&& r) {
-                when_any_completed = true;
-                BOOST_TEST_EQ(r.first, 0u);
-            },
-            [](std::exception_ptr) {})(
-            when_any(fast_task(), cooperative_slow(5), non_cooperative_slow(5)));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-        BOOST_TEST_EQ(winner_finished.load(), 1);
-        BOOST_TEST_EQ(cooperative_cancelled.load(), 1);
-        BOOST_TEST_EQ(non_cooperative_finished.load(), 1);
-    }
-
-    //----------------------------------------------------------
-    // Nested when_any tests
-    //----------------------------------------------------------
-
-    // Test: Nested when_any
-    void
-    testNestedWhenAny()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        int result = 0;
-
-        auto inner1 = []() -> task<int> {
-            auto [idx, res] = co_await when_any(returns_int(10), returns_int(20));
-            co_return std::get<int>(res);
-        };
-
-        auto inner2 = []() -> task<int> {
-            auto [idx, res] = co_await when_any(returns_int(30), returns_int(40));
-            co_return std::get<int>(res);
-        };
-
-        std::size_t winner_index = 999;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(inner1(), inner2()));
-
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        // inner1 returns 10 or 20, inner2 returns 30 or 40
-        if (winner_index == 0)
-            BOOST_TEST(result == 10 || result == 20);
-        else
-            BOOST_TEST(result == 30 || result == 40);
-    }
-
-    // Test: when_any inside when_all
-    void
-    testWhenAnyInsideWhenAll()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-
-        auto race1 = []() -> task<int> {
-            auto [idx, res] = co_await when_any(returns_int(1), returns_int(2));
-            co_return std::get<int>(res);
-        };
-
-        auto race2 = []() -> task<int> {
-            auto [idx, res] = co_await when_any(returns_int(3), returns_int(4));
-            co_return std::get<int>(res);
-        };
+namespace {
 
-        run_async(ex,
-            [&](std::tuple<int, int> t) {
-                auto [a, b] = t;
-                completed = true;
-                BOOST_TEST((a == 1 || a == 2));
-                BOOST_TEST((b == 3 || b == 4));
-            },
-            [](std::exception_ptr) {})(
-            when_all(race1(), race2()));
+io_task<size_t>
+io_success_size(size_t n)
+{
+    co_return io_result<size_t>{{}, n};
+}
 
-        BOOST_TEST(completed);
-    }
+io_task<size_t>
+io_error_size(std::error_code ec, size_t n = 0)
+{
+    co_return io_result<size_t>{ec, n};
+}
 
-    // Test: when_all inside when_any
-    void
-    testWhenAllInsideWhenAny()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
+io_task<std::string>
+io_success_string(std::string s)
+{
+    co_return io_result<std::string>{{}, std::move(s)};
+}
 
-        auto concurrent1 = []() -> task<int> {
-            auto [a, b] = co_await when_all(returns_int(1), returns_int(2));
-            co_return a + b;
-        };
+#if defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable : 4702) // unreachable code after throw
+#endif
 
-        auto concurrent2 = []() -> task<int> {
-            auto [a, b] = co_await when_all(returns_int(3), returns_int(4));
-            co_return a + b;
-        };
+io_task<size_t>
+io_throws_size(char const* msg)
+{
+    throw test_exception(msg);
+    co_return io_result<size_t>{{}, 0};
+}
 
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(concurrent1(), concurrent2()));
+#if defined(_MSC_VER)
+#pragma warning(pop)
+#endif
 
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        // concurrent1 returns 1+2=3, concurrent2 returns 3+4=7
-        if (winner_index == 0)
-            BOOST_TEST_EQ(result_value, 3);
-        else
-            BOOST_TEST_EQ(result_value, 7);
-    }
+// Suspends until stop token fires, then returns ECANCELED.
+io_task<size_t>
+io_pending_size()
+{
+    co_await stop_only_awaitable{};
+    co_return io_result<size_t>{make_error_code(error::canceled), 0};
+}
 
-    //----------------------------------------------------------
-    // Edge case tests
-    //----------------------------------------------------------
+// Awaitable that completes immediately (await_ready = true)
+// returning a successful io_result<size_t>.
+struct immediate_io_awaitable
+{
+    size_t n_;
 
-    // Test: Large number of tasks
-    void
-    testManyTasks()
+    explicit immediate_io_awaitable(size_t n) noexcept
+        : n_(n)
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
-
-        run_async(ex,
-            [&](auto r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(when_any(
-                returns_int(1), returns_int(2), returns_int(3), returns_int(4),
-                returns_int(5), returns_int(6), returns_int(7), returns_int(8)));
-
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index < 8);
-        // Verify correct index-to-value mapping (index 0 -> value 1, etc.)
-        BOOST_TEST_EQ(result_value, static_cast<int>(winner_index + 1));
     }
 
-    // Test: Task that does multiple internal operations
-    static task<int>
-    multi_step_task(int start)
-    {
-        int value = start;
-        value += co_await returns_int(1);
-        value += co_await returns_int(2);
-        co_return value;
-    }
+    bool await_ready() const noexcept { return true; }
 
-    void
-    testTasksWithMultipleSteps()
+    std::coroutine_handle<>
+    await_suspend(std::coroutine_handle<>, io_env const*)
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(multi_step_task(10), multi_step_task(20)));
-
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        // Index 0: 10+1+2=13, Index 1: 20+1+2=23
-        if (winner_index == 0)
-            BOOST_TEST_EQ(result_value, 13);
-        else
-            BOOST_TEST_EQ(result_value, 23);
+        return std::noop_coroutine();
     }
 
-    //----------------------------------------------------------
-    // Awaitable lifecycle tests
-    //----------------------------------------------------------
-
-    // Test: when_any result is move constructible
-    void
-    testAwaitableMoveConstruction()
+    io_result<size_t> await_resume()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
-
-        auto awaitable1 = when_any(returns_int(1), returns_int(2));
-        auto awaitable2 = std::move(awaitable1);
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(std::move(awaitable2));
-
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        if (winner_index == 0)
-            BOOST_TEST_EQ(result_value, 1);
-        else
-            BOOST_TEST_EQ(result_value, 2);
+        return io_result<size_t>{{}, n_};
     }
+};
 
-    // Test: when_any can be stored and awaited later
-    void
-    testDeferredAwait()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
-
-        auto deferred = when_any(returns_int(10), returns_int(20));
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(std::move(deferred));
+io_task<>
+io_void_ok()
+{
+    co_return io_result<>{};
+}
 
-        BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        if (winner_index == 0)
-            BOOST_TEST_EQ(result_value, 10);
-        else
-            BOOST_TEST_EQ(result_value, 20);
-    }
+io_task<>
+io_void_error(std::error_code ec)
+{
+    co_return io_result<>{ec};
+}
 
-    //----------------------------------------------------------
-    // Variant access tests
-    //----------------------------------------------------------
+} // anonymous namespace
 
-    // Test: Correct variant alternative is populated
+struct when_any_vector_test
+{
     void
-    testVariantAlternativePopulated()
+    testSingleTaskSuccess()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
 
-        // Note: <int, string, int> deduplicates to variant<int, string>
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_success_size(42));
+
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 completed = true;
-                // With synchronous executor, first task wins
-                BOOST_TEST_EQ(r.first, 0u);
-                BOOST_TEST(std::holds_alternative<int>(r.second));
-                BOOST_TEST_EQ(std::get<int>(r.second), 42);
+                BOOST_TEST_EQ(v.index(), 1u);
+                auto [idx, val] = std::get<1>(v);
+                BOOST_TEST_EQ(idx, 0u);
+                BOOST_TEST_EQ(val, 42u);
             },
             [](std::exception_ptr) {})(
-            when_any(returns_int(42), returns_string("hello"), returns_int(99)));
+            when_any(std::move(tasks)));
 
         BOOST_TEST(completed);
     }
 
-    // Test: Can use std::visit on result variant
     void
-    testVariantVisit()
+    testMultipleTasksFirstSuccessWins()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        std::variant<int, std::string> result_value;
+
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_success_size(10));
+        tasks.push_back(io_success_size(20));
+        tasks.push_back(io_success_size(30));
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 completed = true;
-                winner_index = r.first;
-                result_value = r.second;
+                BOOST_TEST_EQ(v.index(), 1u);
+                auto [idx, val] = std::get<1>(v);
+                BOOST_TEST(idx < 3);
+                BOOST_TEST_EQ(val, (idx + 1) * 10);
             },
             [](std::exception_ptr) {})(
-            when_any(returns_int(42), returns_string("hello")));
+            when_any(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        if (winner_index == 0)
-            BOOST_TEST_EQ(std::get<int>(result_value), 42);
-        else
-            BOOST_TEST_EQ(std::get<std::string>(result_value), "hello");
-    }
-
-    //----------------------------------------------------------
-    // Parent stop token propagation tests
-    //----------------------------------------------------------
-
-    // Test: Parent stop token already requested before when_any starts
-    void
-    testParentStopAlreadyRequested()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> saw_stop_count{0};
-        bool when_any_completed = false;
-        std::size_t winner_index = 999;
-
-        // A task that checks stop token on first suspension
-        auto check_stop_task = [&](int id) -> task<int> {
-            auto token = (co_await this_coro::environment)->stop_token;
-            if (token.stop_requested()) {
-                ++saw_stop_count;
-            }
-            co_return id;
-        };
-
-        // Use a stop_source to simulate parent cancellation
-        std::stop_source parent_stop;
-        parent_stop.request_stop();
-
-        // Use run_async with stop_token parameter to test propagation
-        run_async(ex, parent_stop.get_token(),
-            [&](auto&& r) {
-                when_any_completed = true;
-                winner_index = r.first;
-            },
-            [](std::exception_ptr) {})(
-            when_any(check_stop_task(1), check_stop_task(2), check_stop_task(3)));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-        // All tasks should have seen the stop token as requested
-        // (inherited from parent)
-        BOOST_TEST_EQ(saw_stop_count.load(), 3);
-    }
-
-    // Test: Parent stop requested after tasks start but before winner
-    void
-    testParentStopDuringExecution()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> cancelled_count{0};
-        bool when_any_completed = false;
-
-        auto slow_task = [&](int id, int steps) -> task<int> {
-            for (int i = 0; i < steps; ++i) {
-                auto token = (co_await this_coro::environment)->stop_token;
-                if (token.stop_requested()) {
-                    ++cancelled_count;
-                    co_return -1;
-                }
-                co_await yield_awaitable{};
-            }
-            co_return id;
-        };
-
-        std::stop_source parent_stop;
-
-        // Use run_async with stop_token parameter
-        run_async(ex, parent_stop.get_token(),
-            [&](auto&&) {
-                when_any_completed = true;
-            },
-            [](std::exception_ptr) {})(
-            when_any(slow_task(1, 10), slow_task(2, 10)));
-
-        // Run a few iterations, then request parent stop
-        for (int i = 0; i < 3 && !work_queue.empty(); ++i) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        // Request stop from parent
-        parent_stop.request_stop();
-
-        // Finish processing
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-        // Both tasks should have been cancelled by parent stop
-        BOOST_TEST_EQ(cancelled_count.load(), 2);
     }
 
-    //----------------------------------------------------------
-    // Interleaved exception tests
-    //----------------------------------------------------------
-
-    // Test: Multiple exceptions thrown with interleaved execution
     void
-    testInterleavedExceptions()
+    testEmptyVectorThrows()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
+        bool caught = false;
 
-        bool caught_exception = false;
-        std::string error_msg;
-
-        // Tasks that yield before throwing
-        auto delayed_throw = [](int id, int yields) -> task<int> {
-            for (int i = 0; i < yields; ++i) {
-                co_await yield_awaitable{};
-            }
-            throw test_exception(("error_" + std::to_string(id)).c_str());
-            co_return id;
-        };
+        std::vector<io_task<size_t>> tasks;
 
         run_async(ex,
-            [](auto&&) {},
+            [](std::variant<std::error_code, std::pair<std::size_t, size_t>>) {},
             [&](std::exception_ptr ep) {
                 try {
                     std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(delayed_throw(1, 2), delayed_throw(2, 1), delayed_throw(3, 3)));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(caught_exception);
-        // Task 2 throws first (after 1 yield)
-        BOOST_TEST_EQ(error_msg, "error_2");
-    }
-
-    //----------------------------------------------------------
-    // Nested stop propagation tests
-    //----------------------------------------------------------
-
-    // Test: Stop propagates through nested when_any - outer task cancelled before inner starts
-    void
-    testNestedStopPropagationOuterCancelled()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> outer_cancelled{0};
-        bool when_any_completed = false;
-        std::size_t winner_index = 999;
-
-        auto fast_task = [&]() -> task<int> {
-            co_return 42;
-        };
-
-        // A task that checks stop before launching inner when_any
-        auto nested_when_any_task = [&]() -> task<int> {
-            auto token = (co_await this_coro::environment)->stop_token;
-            if (token.stop_requested()) {
-                ++outer_cancelled;
-                co_return -1;
-            }
-            // Won't reach here if stopped
-            co_return 100;
-        };
-
-        run_async(ex,
-            [&](auto&& r) {
-                when_any_completed = true;
-                winner_index = r.first;
-            },
-            [](std::exception_ptr) {})(
-            when_any(fast_task(), nested_when_any_task()));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-        BOOST_TEST_EQ(winner_index, 0u);  // fast_task wins
-        // The nested task should see stop and exit early
-        BOOST_TEST_EQ(outer_cancelled.load(), 1);
-    }
-
-    // Test: Stop propagates to inner when_any's children
-    void
-    testNestedStopPropagationInnerCancelled()
-    {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
-
-        std::atomic<int> inner_cancelled{0};
-        std::atomic<int> inner_completed{0};
-        bool when_any_completed = false;
-        std::size_t winner_index = 999;
-
-        // Fast task that yields first to let nested when_any start
-        auto yielding_fast_task = [&]() -> task<int> {
-            co_await yield_awaitable{};
-            co_return 42;
-        };
-
-        auto slow_inner_task = [&](int steps) -> task<int> {
-            for (int i = 0; i < steps; ++i) {
-                auto token = (co_await this_coro::environment)->stop_token;
-                if (token.stop_requested()) {
-                    ++inner_cancelled;
-                    co_return -1;
-                }
-                co_await yield_awaitable{};
-            }
-            ++inner_completed;
-            co_return 100;
-        };
-
-        // A task containing a nested when_any - doesn't check stop first
-        auto nested_when_any_task = [&]() -> task<int> {
-            // Start inner when_any immediately (no stop check first)
-            auto [idx, res] = co_await when_any(
-                slow_inner_task(10),
-                slow_inner_task(10));
-            co_return std::get<int>(res);
-        };
-
-        run_async(ex,
-            [&](auto&& r) {
-                when_any_completed = true;
-                winner_index = r.first;
-            },
-            [](std::exception_ptr) {})(
-            when_any(yielding_fast_task(), nested_when_any_task()));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(when_any_completed);
-        // One of them should win
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-
-        if (winner_index == 0) {
-            // If yielding_fast_task won, the inner tasks should be cancelled
-            BOOST_TEST_EQ(inner_cancelled.load(), 2);
-            BOOST_TEST_EQ(inner_completed.load(), 0);
-        } else {
-            // If nested_when_any_task won (one of its inner tasks completed)
-            // one inner task completes, other gets cancelled
-            BOOST_TEST_EQ(inner_completed.load(), 1);
-            BOOST_TEST_EQ(inner_cancelled.load(), 1);
-        }
-    }
-
-    //----------------------------------------------------------
-    // Variant usage pattern tests
-    //----------------------------------------------------------
-
-    // Test: Document correct pattern for variant access based on index
-    void
-    testVariantAccessByIndex()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        bool correct_access = false;
-
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                // The correct pattern: use index to determine which type to access
-                switch (r.first) {
-                    case 0:
-                        correct_access = std::holds_alternative<int>(r.second);
-                        BOOST_TEST_EQ(std::get<int>(r.second), 42);
-                        break;
-                    case 1:
-                        correct_access = std::holds_alternative<std::string>(r.second);
-                        BOOST_TEST_EQ(std::get<std::string>(r.second), "hello");
-                        break;
-                    case 2:
-                        correct_access = std::holds_alternative<double>(r.second);
-                        BOOST_TEST_EQ(std::get<double>(r.second), 3.14);
-                        break;
+                } catch (std::invalid_argument const&) {
+                    caught = true;
                 }
-            },
-            [](std::exception_ptr) {})(
-            when_any(returns_int(42), returns_string("hello"), []() -> task<double> { co_return 3.14; }()));
+            })(when_any(std::move(tasks)));
 
-        BOOST_TEST(completed);
-        BOOST_TEST(correct_access);
+        BOOST_TEST(caught);
     }
 
-    // Test: Variant with duplicate types - index disambiguation
     void
-    testVariantDuplicateTypesIndexDisambiguation()
+    testVoidTasksSuccess()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
-
-        // when_any(int, int, int) deduplicates to variant<int>
-        // but winner_index tells us WHICH task won
-        run_async(ex,
-            [&](auto&& r) {
-                completed = true;
-                winner_index = r.first;
-                result_value = std::get<int>(r.second);
-            },
-            [](std::exception_ptr) {})(
-            when_any(returns_int(100), returns_int(200), returns_int(300)));
-
-        BOOST_TEST(completed);
-        // With synchronous executor, first task wins
-        BOOST_TEST_EQ(winner_index, 0u);
-        BOOST_TEST_EQ(result_value, 100);
-    }
-
-    void
-    run()
-    {
-        // Basic functionality
-        testSingleTask();
-        testTwoTasksFirstWins();
-        testMixedTypes();
-        testVoidTaskWins();
-        testAllVoidTasks();
 
-        // Exception handling
-        testSingleTaskException();
-        testExceptionWinsRace();
-        testVoidTaskException();
-        testMultipleExceptionsFirstWins();
-
-        // Stop token propagation
-        testStopRequestedOnCompletion();
-        testAllTasksCompleteForCleanup();
-
-        // Parent stop token propagation
-        testParentStopAlreadyRequested();
-        testParentStopDuringExecution();
-
-        // Long-lived task cancellation
-        testLongLivedTasksCancelledOnWinner();
-        testSlowTaskCanWin();
-        testNonCooperativeTasksStillComplete();
-        testMixedCooperativeAndNonCooperativeTasks();
-
-        // Interleaved exceptions
-        testInterleavedExceptions();
-
-        // Nested combinators
-        testNestedWhenAny();
-        testWhenAnyInsideWhenAll();
-        testWhenAllInsideWhenAny();
-
-        // Nested stop propagation
-        testNestedStopPropagationOuterCancelled();
-        testNestedStopPropagationInnerCancelled();
-
-        // Edge cases
-        testManyTasks();
-        testTasksWithMultipleSteps();
-
-        // Awaitable lifecycle
-        testAwaitableMoveConstruction();
-        testDeferredAwait();
-
-        // Variant access
-        testVariantAlternativePopulated();
-        testVariantVisit();
-        testVariantAccessByIndex();
-        testVariantDuplicateTypesIndexDisambiguation();
-    }
-};
-
-TEST_SUITE(
-    when_any_test,
-    "boost.capy.when_any");
+        std::vector<io_task<>> tasks;
+        tasks.push_back(io_void_ok());
+        tasks.push_back(io_void_ok());
+        tasks.push_back(io_void_ok());
 
-//----------------------------------------------------------
-// Homogeneous when_any tests (vector overload)
-//----------------------------------------------------------
+        run_async(ex,
+            [&](std::variant<std::error_code, std::size_t> v) {
+                completed = true;
+                BOOST_TEST_EQ(v.index(), 1u);
+                BOOST_TEST(std::get<1>(v) < 3);
+            },
+            [](std::exception_ptr) {})(
+            when_any(std::move(tasks)));
 
-struct when_any_vector_test
-{
-    //----------------------------------------------------------
-    // Basic functionality tests
-    //----------------------------------------------------------
+        BOOST_TEST(completed);
+    }
 
-    // Test: Single task in vector
     void
-    testSingleTaskVector()
+    testErrorDoesNotWin()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int result = 0;
-        std::size_t winner_index = 999;
 
-        std::vector<task<int>> tasks;
-        tasks.push_back(returns_int(42));
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_error_size(make_error_code(error::eof)));
+        tasks.push_back(io_success_size(100));
 
         run_async(ex,
-            [&](std::pair<std::size_t, int> r) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 completed = true;
-                winner_index = r.first;
-                result = r.second;
+                BOOST_TEST_EQ(v.index(), 1u);
+                auto [idx, val] = std::get<1>(v);
+                BOOST_TEST_EQ(idx, 1u);
+                BOOST_TEST_EQ(val, 100u);
             },
             [](std::exception_ptr) {})(
             when_any(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(winner_index, 0u);
-        BOOST_TEST_EQ(result, 42);
     }
 
-    // Test: Multiple tasks in vector
     void
-    testMultipleTasksVector()
+    testAllFailReturnsError()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
 
-        std::vector<task<int>> tasks;
-        tasks.push_back(returns_int(10));
-        tasks.push_back(returns_int(20));
-        tasks.push_back(returns_int(30));
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_error_size(make_error_code(error::eof)));
+        tasks.push_back(io_error_size(make_error_code(error::timeout)));
 
         run_async(ex,
-            [&](std::pair<std::size_t, int> r) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 completed = true;
-                winner_index = r.first;
-                result_value = r.second;
+                BOOST_TEST_EQ(v.index(), 0u);
+                BOOST_TEST(!!std::get<0>(v));
             },
             [](std::exception_ptr) {})(
             when_any(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index < 3);
-        // Verify correct index-to-value mapping
-        BOOST_TEST_EQ(result_value, static_cast<int>((winner_index + 1) * 10));
     }
 
-    // Test: Empty vector throws
     void
-    testEmptyVectorThrows()
+    testAllThrowRethrows()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        bool caught = false;
+        std::string msg;
 
-        std::vector<task<int>> tasks;
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_throws_size("first"));
+        tasks.push_back(io_throws_size("second"));
 
         run_async(ex,
-            [](std::pair<std::size_t, int>) {},
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>>) {
+                completed = true;
+            },
             [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (std::invalid_argument const&) {
-                    caught_exception = true;
+                try { std::rethrow_exception(ep); }
+                catch (test_exception const& e) {
+                    caught = true;
+                    msg = e.what();
                 }
             })(when_any(std::move(tasks)));
 
-        BOOST_TEST(caught_exception);
+        BOOST_TEST(!completed);
+        BOOST_TEST(caught);
+        BOOST_TEST_EQ(msg, "second");
     }
 
-    // Test: Void tasks in vector
     void
-    testVoidTasksVector()
+    testExceptionDoesNotWin()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
 
-        std::vector<task<void>> tasks;
-        tasks.push_back(void_task());
-        tasks.push_back(void_task());
-        tasks.push_back(void_task());
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_throws_size("boom"));
+        tasks.push_back(io_success_size(55));
 
         run_async(ex,
-            [&](std::size_t idx) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 completed = true;
-                winner_index = idx;
+                BOOST_TEST_EQ(v.index(), 1u);
+                auto [idx, val] = std::get<1>(v);
+                BOOST_TEST_EQ(idx, 1u);
+                BOOST_TEST_EQ(val, 55u);
             },
             [](std::exception_ptr) {})(
             when_any(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index < 3);
-    }
-
-    //----------------------------------------------------------
-    // Exception handling tests
-    //----------------------------------------------------------
-
-    // Test: Exception from task in vector
-    void
-    testExceptionInVector()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
-
-        std::vector<task<int>> tasks;
-        tasks.push_back(throws_exception("vector error"));
-
-        run_async(ex,
-            [](std::pair<std::size_t, int>) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(std::move(tasks)));
-
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "vector error");
     }
 
-    // Test: Exception wins race in vector
+    // Last failure wins. Error child runs last, so error is reported.
     void
-    testExceptionWinsRaceVector()
+    testLastFailureWins()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
+        std::error_code result_ec;
 
-        std::vector<task<int>> tasks;
-        tasks.push_back(throws_exception("winner"));
-        tasks.push_back(returns_int(42));
-        tasks.push_back(returns_int(99));
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(io_throws_size("exception"));
+        tasks.push_back(io_error_size(make_error_code(error::eof)));
 
         run_async(ex,
-            [](std::pair<std::size_t, int>) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(std::move(tasks)));
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
+                completed = true;
+                BOOST_TEST_EQ(v.index(), 0u);
+                result_ec = std::get<0>(v);
+            },
+            [](std::exception_ptr) {})(
+            when_any(std::move(tasks)));
 
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "winner");
+        BOOST_TEST(completed);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    // Test: Void task exception in vector
     void
-    testVoidExceptionInVector()
+    testVoidErrorDoesNotWin()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-        std::string error_msg;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
 
-        std::vector<task<void>> tasks;
-        tasks.push_back(void_throws_exception("void vector error"));
-        tasks.push_back(void_task());
+        std::vector<io_task<>> tasks;
+        tasks.push_back(io_void_error(make_error_code(error::eof)));
+        tasks.push_back(io_void_ok());
 
         run_async(ex,
-            [](std::size_t) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (test_exception const& e) {
-                    caught_exception = true;
-                    error_msg = e.what();
-                }
-            })(when_any(std::move(tasks)));
+            [&](std::variant<std::error_code, std::size_t> v) {
+                completed = true;
+                BOOST_TEST_EQ(v.index(), 1u);
+                BOOST_TEST_EQ(std::get<1>(v), 1u);
+            },
+            [](std::exception_ptr) {})(
+            when_any(std::move(tasks)));
 
-        BOOST_TEST(caught_exception);
-        BOOST_TEST_EQ(error_msg, "void vector error");
+        BOOST_TEST(completed);
     }
 
-    //----------------------------------------------------------
-    // Stop token propagation tests
-    //----------------------------------------------------------
-
-    // Test: All tasks complete for cleanup (vector)
     void
-    testAllTasksCompleteForCleanupVector()
+    testAllTasksCompleteForCleanup()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         std::atomic<int> completion_count{0};
         bool completed = false;
 
-        auto counting_task = [&](int value) -> task<int> {
+        auto counting = [&](size_t value) -> io_task<size_t> {
             ++completion_count;
-            co_return value;
+            co_return io_result<size_t>{{}, value};
         };
 
-        std::vector<task<int>> tasks;
-        tasks.push_back(counting_task(1));
-        tasks.push_back(counting_task(2));
-        tasks.push_back(counting_task(3));
-        tasks.push_back(counting_task(4));
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(counting(1));
+        tasks.push_back(counting(2));
+        tasks.push_back(counting(3));
+        tasks.push_back(counting(4));
 
         run_async(ex,
-            [&](std::pair<std::size_t, int>) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>>) {
                 completed = true;
             },
             [](std::exception_ptr) {})(
             when_any(std::move(tasks)));
 
         BOOST_TEST(completed);
-        // All four tasks must complete for proper cleanup
         BOOST_TEST_EQ(completion_count.load(), 4);
     }
 
-    //----------------------------------------------------------
-    // Long-lived task cancellation tests (vector)
-    //----------------------------------------------------------
-
-    // Test: Long-lived tasks cancelled on winner (vector)
     void
-    testLongLivedTasksCancelledVector()
+    testLongLivedTasksCancelled()
     {
         std::queue<std::coroutine_handle<>> work_queue;
         queuing_executor ex(work_queue);
@@ -1512,37 +404,37 @@ struct when_any_vector_test
         std::atomic<int> cancelled_count{0};
         std::atomic<int> completed_normally_count{0};
         bool when_any_completed = false;
-        std::size_t winner_index = 999;
-        int winner_value = 0;
 
-        auto fast_task = [&]() -> task<int> {
+        auto fast = [&]() -> io_task<size_t> {
             ++completed_normally_count;
-            co_return 42;
+            co_return io_result<size_t>{{}, 42};
         };
 
-        auto slow_task = [&](int id, int steps) -> task<int> {
+        auto slow = [&](size_t id, int steps) -> io_task<size_t> {
             for (int i = 0; i < steps; ++i) {
                 auto token = (co_await this_coro::environment)->stop_token;
                 if (token.stop_requested()) {
                     ++cancelled_count;
-                    co_return -1;
+                    co_return io_result<size_t>{make_error_code(error::canceled), 0};
                 }
                 co_await yield_awaitable{};
             }
             ++completed_normally_count;
-            co_return id;
+            co_return io_result<size_t>{{}, id};
         };
 
-        std::vector<task<int>> tasks;
-        tasks.push_back(fast_task());
-        tasks.push_back(slow_task(100, 10));
-        tasks.push_back(slow_task(200, 10));
+        std::vector<io_task<size_t>> tasks;
+        tasks.push_back(fast());
+        tasks.push_back(slow(100, 10));
+        tasks.push_back(slow(200, 10));
 
         run_async(ex,
-            [&](std::pair<std::size_t, int> r) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 when_any_completed = true;
-                winner_index = r.first;
-                winner_value = r.second;
+                BOOST_TEST_EQ(v.index(), 1u);
+                auto [idx, val] = std::get<1>(v);
+                BOOST_TEST_EQ(idx, 0u);
+                BOOST_TEST_EQ(val, 42u);
             },
             [](std::exception_ptr) {})(
             when_any(std::move(tasks)));
@@ -1554,74 +446,62 @@ struct when_any_vector_test
         }
 
         BOOST_TEST(when_any_completed);
-        BOOST_TEST_EQ(winner_index, 0u);
-        BOOST_TEST_EQ(winner_value, 42);
         BOOST_TEST_EQ(completed_normally_count.load(), 1);
         BOOST_TEST_EQ(cancelled_count.load(), 2);
     }
 
-    //----------------------------------------------------------
-    // Large vector tests
-    //----------------------------------------------------------
-
-    // Test: Many tasks in vector
     void
-    testManyTasksVector()
+    testManyTasks()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        int result_value = 0;
 
-        std::vector<task<int>> tasks;
-        for (int i = 1; i <= 20; ++i)
-            tasks.push_back(returns_int(i));
+        std::vector<io_task<size_t>> tasks;
+        for (size_t i = 1; i <= 20; ++i)
+            tasks.push_back(io_success_size(i));
 
         run_async(ex,
-            [&](std::pair<std::size_t, int> r) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 completed = true;
-                winner_index = r.first;
-                result_value = r.second;
+                BOOST_TEST_EQ(v.index(), 1u);
+                auto [idx, val] = std::get<1>(v);
+                BOOST_TEST(idx < 20);
+                BOOST_TEST_EQ(val, idx + 1);
             },
             [](std::exception_ptr) {})(
             when_any(std::move(tasks)));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index < 20);
-        // Verify correct index-to-value mapping (index 0 -> value 1, etc.)
-        BOOST_TEST_EQ(result_value, static_cast<int>(winner_index + 1));
     }
 
-    //----------------------------------------------------------
-    // Nested combinator tests
-    //----------------------------------------------------------
-
-    // Test: Nested when_any with vectors
     void
-    testNestedWhenAnyVector()
+    testNestedWhenAny()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        int result = 0;
-
-        auto inner = []() -> task<int> {
-            std::vector<task<int>> tasks;
-            tasks.push_back(returns_int(10));
-            tasks.push_back(returns_int(20));
-            auto [idx, res] = co_await when_any(std::move(tasks));
-            co_return res;
+        size_t result = 0;
+
+        auto inner = []() -> io_task<size_t> {
+            std::vector<io_task<size_t>> tasks;
+            tasks.push_back(io_success_size(10));
+            tasks.push_back(io_success_size(20));
+            auto v = co_await when_any(std::move(tasks));
+            if(v.index() == 1)
+                co_return io_result<size_t>{{}, std::get<1>(v).second};
+            co_return io_result<size_t>{std::get<0>(v), 0};
         };
 
-        std::vector<task<int>> outer_tasks;
+        std::vector<io_task<size_t>> outer_tasks;
         outer_tasks.push_back(inner());
         outer_tasks.push_back(inner());
 
         run_async(ex,
-            [&](std::pair<std::size_t, int> r) {
+            [&](std::variant<std::error_code, std::pair<std::size_t, size_t>> v) {
                 completed = true;
-                result = r.second;
+                BOOST_TEST_EQ(v.index(), 1u);
+                result = std::get<1>(v).second;
             },
             [](std::exception_ptr) {})(
             when_any(std::move(outer_tasks)));
@@ -1630,106 +510,26 @@ struct when_any_vector_test
         BOOST_TEST(result == 10 || result == 20);
     }
 
-    // Test: when_any vector inside when_all
-    void
-    testWhenAnyVectorInsideWhenAll()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-
-        auto race = []() -> task<int> {
-            std::vector<task<int>> tasks;
-            tasks.push_back(returns_int(1));
-            tasks.push_back(returns_int(2));
-            auto [idx, res] = co_await when_any(std::move(tasks));
-            co_return res;
-        };
-
-        run_async(ex,
-            [&](std::tuple<int, int> t) {
-                auto [a, b] = t;
-                completed = true;
-                BOOST_TEST((a == 1 || a == 2));
-                BOOST_TEST((b == 1 || b == 2));
-            },
-            [](std::exception_ptr) {})(
-            when_all(race(), race()));
-
-        BOOST_TEST(completed);
-    }
-
-    //----------------------------------------------------------
-    // Mixed variadic and vector tests
-    //----------------------------------------------------------
-
-    // Test: Mix variadic and vector when_any
-    void
-    testMixedVariadicAndVector()
-    {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool completed = false;
-        std::size_t outer_winner = 999;
-
-        auto variadic_race = []() -> task<int> {
-            auto [idx, res] = co_await when_any(returns_int(1), returns_int(2));
-            co_return std::get<int>(res);
-        };
-
-        auto vector_race = []() -> task<int> {
-            std::vector<task<int>> tasks;
-            tasks.push_back(returns_int(3));
-            tasks.push_back(returns_int(4));
-            auto [idx, res] = co_await when_any(std::move(tasks));
-            co_return res;
-        };
-
-        run_async(ex,
-            [&](auto r) {
-                completed = true;
-                outer_winner = r.first;
-                auto result = std::get<int>(r.second);
-                if (outer_winner == 0)
-                    BOOST_TEST((result == 1 || result == 2));
-                else
-                    BOOST_TEST((result == 3 || result == 4));
-            },
-            [](std::exception_ptr) {})(
-            when_any(variadic_race(), vector_race()));
-
-        BOOST_TEST(completed);
-    }
-
     void
     run()
     {
-        // Basic functionality
-        testSingleTaskVector();
-        testMultipleTasksVector();
+        testSingleTaskSuccess();
+        testMultipleTasksFirstSuccessWins();
         testEmptyVectorThrows();
-        testVoidTasksVector();
+        testVoidTasksSuccess();
 
-        // Exception handling
-        testExceptionInVector();
-        testExceptionWinsRaceVector();
-        testVoidExceptionInVector();
+        testErrorDoesNotWin();
+        testAllFailReturnsError();
+        testAllThrowRethrows();
+        testExceptionDoesNotWin();
+        testLastFailureWins();
+        testVoidErrorDoesNotWin();
 
-        // Stop token propagation
-        testAllTasksCompleteForCleanupVector();
-
-        // Long-lived task cancellation
-        testLongLivedTasksCancelledVector();
-
-        // Large vectors
-        testManyTasksVector();
-
-        // Nested combinators
-        testNestedWhenAnyVector();
-        testWhenAnyVectorInsideWhenAll();
+        testAllTasksCompleteForCleanup();
+        testLongLivedTasksCancelled();
 
-        // Mixed variadic and vector
-        testMixedVariadicAndVector();
+        testManyTasks();
+        testNestedWhenAny();
     }
 };
 
@@ -1737,520 +537,382 @@ TEST_SUITE(
     when_any_vector_test,
     "boost.capy.when_any_vector");
 
-//----------------------------------------------------------
-// IoAwaitable (non-task) tests for when_any
-//----------------------------------------------------------
-
-struct when_any_io_awaitable_test
+// Tests for io_result-aware when_any behavior per the combinators spec.
+// Each test is labelled with the spec row it verifies.
+struct when_any_io_result_test
 {
-    // Test: when_any with stop_only_awaitable (void IoAwaitable) and task<int>
+    // Spec Row 1: First task to return !ec
+    // Wins. Cancel siblings. Return winner's result.
     void
-    testStopOnlyAwaitableWithTask()
+    testFirstSuccessWins()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
+        size_t winner_index = 999;
+        size_t result = 0;
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
+                winner_index = v.index();
+                if(v.index() == 1)
+                    result = std::get<1>(v);
             },
             [](std::exception_ptr) {})(
-            when_any(stop_only_awaitable{}, returns_int(42)));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                io_success_size(42),
+                io_pending_size()));
 
         BOOST_TEST(completed);
-        // task<int> completes immediately, stop_only_awaitable wakes via stop
+        // Child 0 succeeded -> variant at index 1
         BOOST_TEST_EQ(winner_index, 1u);
+        BOOST_TEST_EQ(result, 42u);
     }
 
-    // Test: when_any with async_event wait_awaiter and task<int>
+    // Spec Row 1 (single child)
     void
-    testAsyncEventWaitWithTask()
+    testSingleTaskSuccess()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-
-        async_event event;
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t> v) {
                 completed = true;
-                winner_index = r.first;
+                BOOST_TEST_EQ(v.index(), 1u);
+                BOOST_TEST_EQ(std::get<1>(v), 99u);
             },
             [](std::exception_ptr) {})(
-            when_any(event.wait(), returns_int(42)));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(io_success_size(99)));
 
         BOOST_TEST(completed);
-        // task<int> completes first, event.wait() cancelled via stop token
-        BOOST_TEST_EQ(winner_index, 1u);
     }
 
-    // Test: when_any with two stop_only_awaitables (homogeneous non-task)
+    // Spec Row 2: One task returns ec, others pending
+    // Does not win. Keep waiting.
     void
-    testTwoStopOnlyAwaitables()
+    testErrorDoesNotWin()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
+        size_t winner_index = 999;
 
-        // Use a stop_source to cancel from parent
-        std::stop_source parent_stop;
-
-        run_async(ex, parent_stop.get_token(),
-            [&](auto&& r) {
+        run_async(ex,
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
+                winner_index = v.index();
+                if(v.index() == 2)
+                    BOOST_TEST_EQ(std::get<2>(v), 100u);
             },
             [](std::exception_ptr) {})(
-            when_any(stop_only_awaitable{}, stop_only_awaitable{}));
-
-        // Neither can complete on their own - request parent stop
-        parent_stop.request_stop();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                io_error_size(make_error_code(error::eof)),
+                io_success_size(100)));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
+        // Child 0 failed, child 1 won -> index 2
+        BOOST_TEST_EQ(winner_index, 2u);
     }
 
-    // Test: when_any with io_task<> (task<io_result<>>)
+    // Spec Row 3: One succeeds, one already failed
+    // Successful task wins.
     void
-    testIoTaskWithWhenAny()
+    testSuccessAfterFailure()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-
-        auto io_op = []() -> io_task<> {
-            co_return io_result<>{{}};
-        };
+        size_t winner_index = 999;
+        size_t result = 0;
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
+                winner_index = v.index();
+                if(v.index() == 2)
+                    result = std::get<2>(v);
             },
             [](std::exception_ptr) {})(
-            when_any(io_op(), io_op()));
+            when_any(
+                io_error_size(make_error_code(error::eof)),
+                io_success_size(77)));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
+        BOOST_TEST_EQ(winner_index, 2u);
+        BOOST_TEST_EQ(result, 77u);
     }
 
-    // Test: when_any with io_task<size_t> (task<io_result<size_t>>)
+    // Spec Row 4: All tasks return ec (all fail)
+    // No winner. Variant holds error_code at index 0.
     void
-    testIoTaskWithValueAndWhenAny()
+    testAllFail()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        io_result<std::size_t> result;
-
-        auto io_read = [](std::size_t n) -> io_task<std::size_t> {
-            co_return io_result<std::size_t>{{}, n};
-        };
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
-                result = std::get<io_result<std::size_t>>(r.second);
+                BOOST_TEST_EQ(v.index(), 0u);
+                auto ec = std::get<0>(v);
+                // Spec: which child's ec is unspecified
+                BOOST_TEST(!!ec);
             },
             [](std::exception_ptr) {})(
-            when_any(io_read(100), io_read(200)));
+            when_any(
+                io_error_size(make_error_code(error::eof)),
+                io_error_size(make_error_code(error::timeout))));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
-        if (winner_index == 0)
-            BOOST_TEST_EQ(result.t1, 100u);
-        else
-            BOOST_TEST_EQ(result.t1, 200u);
     }
 
-    // Test: when_any with mixed io_task and regular task
+    // Spec Row 5: One task throws, others pending
+    // Exception does not win. Keep waiting for a success.
     void
-    testIoTaskMixedWithRegularTask()
+    testExceptionDoesNotWin()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-
-        auto io_op = []() -> io_task<std::size_t> {
-            co_return io_result<std::size_t>{{}, 42};
-        };
+        size_t winner_index = 999;
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
+                winner_index = v.index();
+                if(v.index() == 2)
+                    BOOST_TEST_EQ(std::get<2>(v), 55u);
             },
             [](std::exception_ptr) {})(
-            when_any(io_op(), returns_int(99)));
+            when_any(
+                io_throws_size("boom"),
+                io_success_size(55)));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index == 0 || winner_index == 1);
+        // Child 0 threw (discarded), child 1 won -> index 2
+        BOOST_TEST_EQ(winner_index, 2u);
     }
 
-    // Test: vector of event waiters (range overload with non-task IoAwaitable)
+    // Spec Row 6: All tasks throw
+    // No success possible. Rethrow first exception.
     void
-    testVectorOfEventWaiters()
+    testAllThrow()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-
-        async_event event1;
-        async_event event2;
-
-        std::vector<async_event::wait_awaiter> waiters;
-        waiters.push_back(event1.wait());
-        waiters.push_back(event2.wait());
+        bool caught = false;
+        std::string msg;
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t>) {
                 completed = true;
-                winner_index = r.first;
             },
-            [](std::exception_ptr) {})(
-            when_any(std::move(waiters)));
-
-        // Set event1 to wake the first waiter
-        event1.set();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
-
-        BOOST_TEST(completed);
-        BOOST_TEST_EQ(winner_index, 0u);
-    }
+            [&](std::exception_ptr ep) {
+                try { std::rethrow_exception(ep); }
+                catch (test_exception const& e) {
+                    caught = true;
+                    msg = e.what();
+                }
+            })(when_any(
+                io_throws_size("first"),
+                io_throws_size("second")));
 
-    void
-    run()
-    {
-        testStopOnlyAwaitableWithTask();
-        testAsyncEventWaitWithTask();
-        testTwoStopOnlyAwaitables();
-        testIoTaskWithWhenAny();
-        testIoTaskWithValueAndWhenAny();
-        testIoTaskMixedWithRegularTask();
-        testVectorOfEventWaiters();
+        BOOST_TEST(!completed);
+        BOOST_TEST(caught);
+        BOOST_TEST_EQ(msg, "second");
     }
-};
-
-TEST_SUITE(
-    when_any_io_awaitable_test,
-    "boost.capy.when_any_io_awaitable");
 
-//----------------------------------------------------------
-// IoAwaitableRange tests for when_any (range overloads
-// with non-task IoAwaitable element types)
-//----------------------------------------------------------
-
-struct when_any_io_awaitable_range_test
-{
-    // Test: vector of stop_only_awaitables (void range overload)
+    // Spec Row 7: Parent stop fires before any completion
+    // All children cancelled. Variant holds error_code at index 0 (ECANCELED).
     void
-    testVoidRangeStopOnlyAwaitables()
+    testCanceledAllFail()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
 
-        std::stop_source parent_stop;
-
-        std::vector<stop_only_awaitable> awaitables;
-        awaitables.push_back(stop_only_awaitable{});
-        awaitables.push_back(stop_only_awaitable{});
-        awaitables.push_back(stop_only_awaitable{});
-
-        run_async(ex, parent_stop.get_token(),
-            [&](std::size_t idx) {
+        run_async(ex,
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = idx;
+                BOOST_TEST_EQ(v.index(), 0u);
+                auto ec = std::get<0>(v);
+                BOOST_TEST(ec == cond::canceled);
             },
             [](std::exception_ptr) {})(
-            when_any(std::move(awaitables)));
-
-        // All three are suspended waiting for stop
-        parent_stop.request_stop();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                io_error_size(make_error_code(error::canceled)),
+                io_error_size(make_error_code(error::canceled))));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index < 3);
     }
 
-    // Test: vector of event waiters (non-void range overload)
+    // Spec Row 8: ec == eof, n == 0
+    // Error. Does not win.
     void
-    testNonVoidRangeEventWaiters()
+    testEofDoesNotWin()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        io_result<> winner_result;
-
-        async_event event0;
-        async_event event1;
-        async_event event2;
-
-        std::vector<async_event::wait_awaiter> waiters;
-        waiters.push_back(event0.wait());
-        waiters.push_back(event1.wait());
-        waiters.push_back(event2.wait());
+        size_t winner_index = 999;
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
-                winner_result = r.second;
+                winner_index = v.index();
             },
             [](std::exception_ptr) {})(
-            when_any(std::move(waiters)));
-
-        // Set event1 - second waiter wins
-        event1.set();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                io_error_size(make_error_code(error::eof), 0),
+                io_success_size(200)));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(winner_index, 1u);
-        // Winner completed via set(), no error
-        BOOST_TEST(!winner_result.ec);
+        // EOF didn't win; child 1 (success) won -> index 2
+        BOOST_TEST_EQ(winner_index, 2u);
     }
 
-    // Test: vector of event waiters where winner is cancelled
+    // Spec Row 9: Immediate completion (await_ready true)
+    // Wins normally. No special treatment.
     void
-    testNonVoidRangeAllCancelled()
+    testImmediateCompletion()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        io_result<> winner_result;
-
-        async_event event0;
-        async_event event1;
-
-        std::stop_source parent_stop;
 
-        std::vector<async_event::wait_awaiter> waiters;
-        waiters.push_back(event0.wait());
-        waiters.push_back(event1.wait());
-
-        run_async(ex, parent_stop.get_token(),
-            [&](auto&& r) {
+        run_async(ex,
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
-                winner_result = r.second;
+                // Immediate awaitable is child 0 -> index 1
+                BOOST_TEST_EQ(v.index(), 1u);
+                BOOST_TEST_EQ(std::get<1>(v), 77u);
             },
             [](std::exception_ptr) {})(
-            when_any(std::move(waiters)));
-
-        // Cancel from parent - no events set
-        parent_stop.request_stop();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                immediate_io_awaitable(77),
+                io_pending_size()));
 
         BOOST_TEST(completed);
-        BOOST_TEST(winner_index < 2);
-        // Winner completed via cancellation
-        BOOST_TEST_EQ(winner_result.ec,
-            make_error_code(error::canceled));
     }
 
-    // Test: single-element range of non-task IoAwaitable
+    // Spec Row 10 (mixed types)
     void
-    testSingleElementRange()
+    testMixedTypes()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-
-        async_event event;
-
-        std::vector<async_event::wait_awaiter> waiters;
-        waiters.push_back(event.wait());
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, std::string> v) {
                 completed = true;
-                winner_index = r.first;
+                // First child (size_t) succeeds -> index 1
+                BOOST_TEST_EQ(v.index(), 1u);
+                BOOST_TEST_EQ(std::get<1>(v), 42u);
             },
             [](std::exception_ptr) {})(
-            when_any(std::move(waiters)));
-
-        event.set();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                io_success_size(42),
+                io_success_string("hello")));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(winner_index, 0u);
     }
 
-    // Test: empty range of non-task IoAwaitable throws
+    // Single task fails -> variant at index 0
     void
-    testEmptyRangeThrows()
+    testSingleTaskError()
     {
-        int dispatch_count = 0;
-        test_executor ex(dispatch_count);
-        bool caught_exception = false;
-
-        std::vector<async_event::wait_awaiter> waiters;
+        int dc = 0;
+        test_executor ex(dc);
+        bool completed = false;
 
         run_async(ex,
-            [](auto&&) {},
-            [&](std::exception_ptr ep) {
-                try {
-                    std::rethrow_exception(ep);
-                } catch (std::invalid_argument const&) {
-                    caught_exception = true;
-                }
-            })(when_any(std::move(waiters)));
+            [&](std::variant<std::error_code, size_t> v) {
+                completed = true;
+                BOOST_TEST_EQ(v.index(), 0u);
+                BOOST_TEST(std::get<0>(v) == cond::eof);
+            },
+            [](std::exception_ptr) {})(
+            when_any(
+                io_error_size(make_error_code(error::eof))));
 
-        BOOST_TEST(caught_exception);
+        BOOST_TEST(completed);
     }
 
-    // Test: event waiters where first event is already set
+    // Last failure wins. Error child runs last, so error is reported.
     void
-    testAlreadySetEventInRange()
+    testLastFailureWins()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-        io_result<> winner_result;
-
-        async_event event0;
-        async_event event1;
-
-        // Set event0 before creating waiters
-        event0.set();
-
-        std::vector<async_event::wait_awaiter> waiters;
-        waiters.push_back(event0.wait());
-        waiters.push_back(event1.wait());
+        std::error_code result_ec;
 
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
-                winner_result = r.second;
+                BOOST_TEST_EQ(v.index(), 0u);
+                result_ec = std::get<0>(v);
             },
             [](std::exception_ptr) {})(
-            when_any(std::move(waiters)));
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                io_throws_size("exception"),
+                io_error_size(make_error_code(error::eof))));
 
         BOOST_TEST(completed);
-        // event0 was already set, so waiter 0 completes immediately
-        BOOST_TEST_EQ(winner_index, 0u);
-        BOOST_TEST(!winner_result.ec);
+        BOOST_TEST(result_ec == cond::eof);
     }
 
-    // Test: large range of non-task IoAwaitables
+    // Winner identification: variant.index() - 1 == winning child index
     void
-    testLargeRange()
+    testWinnerIndex()
     {
-        std::queue<std::coroutine_handle<>> work_queue;
-        queuing_executor ex(work_queue);
+        int dc = 0;
+        test_executor ex(dc);
         bool completed = false;
-        std::size_t winner_index = 999;
-
-        constexpr std::size_t count = 20;
-        std::vector<async_event> events(count);
-
-        std::vector<async_event::wait_awaiter> waiters;
-        for (std::size_t i = 0; i < count; ++i)
-            waiters.push_back(events[i].wait());
 
+        // Child 0 fails, child 1 wins -> variant.index() == 2
         run_async(ex,
-            [&](auto&& r) {
+            [&](std::variant<std::error_code, size_t, size_t> v) {
                 completed = true;
-                winner_index = r.first;
+                BOOST_TEST_EQ(v.index(), 2u);
             },
             [](std::exception_ptr) {})(
-            when_any(std::move(waiters)));
-
-        // Set the 15th event
-        events[15].set();
-
-        while (!work_queue.empty()) {
-            auto h = work_queue.front();
-            work_queue.pop();
-            h.resume();
-        }
+            when_any(
+                io_error_size(make_error_code(error::eof)),
+                io_success_size(42)));
 
         BOOST_TEST(completed);
-        BOOST_TEST_EQ(winner_index, 15u);
     }
 
     void
     run()
     {
-        testVoidRangeStopOnlyAwaitables();
-        testNonVoidRangeEventWaiters();
-        testNonVoidRangeAllCancelled();
-        testSingleElementRange();
-        testEmptyRangeThrows();
-        testAlreadySetEventInRange();
-        testLargeRange();
+        testFirstSuccessWins();
+        testSingleTaskSuccess();
+        testErrorDoesNotWin();
+        testSuccessAfterFailure();
+        testAllFail();
+        testExceptionDoesNotWin();
+        testAllThrow();
+        testCanceledAllFail();
+        testEofDoesNotWin();
+        testImmediateCompletion();
+        testMixedTypes();
+        testSingleTaskError();
+        testLastFailureWins();
+        testWinnerIndex();
     }
 };
 
 TEST_SUITE(
-    when_any_io_awaitable_range_test,
-    "boost.capy.when_any_io_awaitable_range");
+    when_any_io_result_test,
+    "boost.capy.when_any_io_result");
 
 } // capy
 } // boost
diff --git a/test/unit/write.cpp b/test/unit/write.cpp
index a80185e8b..3dcbe98a5 100644
--- a/test/unit/write.cpp
+++ b/test/unit/write.cpp
@@ -10,7 +10,6 @@
 // Test that header file is self-contained.
 #include <boost/capy/write.hpp>
 
-#include <boost/capy/buffers/buffer_pair.hpp>
 #include <boost/capy/buffers/make_buffer.hpp>
 #include <boost/capy/error.hpp>
 #include <boost/capy/test/fuse.hpp>
@@ -97,7 +96,7 @@ struct buffer_pair_factory
     {
     }
 
-    const_buffer_pair
+    std::array<const_buffer, 2>
     buffer() const
     {
         return {{
@@ -252,12 +251,45 @@ struct write_test
         }));
     }
 
+    // Regression: capy#263. Free-function write() must take its buffer
+    // sequence by value so that storing the returned awaitable past
+    // the full-expression that created the sequence does not dangle.
+    void
+    testWriteStoredAwaitableTemporarySequence()
+    {
+        BOOST_TEST(test::fuse().armed([](test::fuse& f) -> task<void>
+        {
+            test::write_stream ws(f);
+
+            char const data1[] = "hello";
+            char const data2[] = "world";
+
+            // The std::array<const_buffer, 2> argument is a temporary
+            // that ends its lifetime at the end of this full-expression.
+            auto aw = write(ws, std::array<const_buffer, 2>{{
+                const_buffer(data1, 5),
+                const_buffer(data2, 5)
+            }});
+
+            // If write() bound the sequence by const&, the awaitable now
+            // holds a dangling reference and the next line trips ASan
+            // (or silently reads stale stack).
+            auto [ec, n] = co_await std::move(aw);
+            if(ec)
+                co_return;
+
+            BOOST_TEST_EQ(n, 10u);
+            BOOST_TEST_EQ(ws.data(), "helloworld");
+        }));
+    }
+
     void
     testWriteStream()
     {
         testWriteSingleBuffer();
         testWriteBufferArray();
         testWriteBufferPair();
+        testWriteStoredAwaitableTemporarySequence();
     }
 
     //----------------------------------------------------------