From b288bbc8ab4ce464922a228995e8aa27aafcaf6d Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 4 May 2026 19:57:45 +0000 Subject: [PATCH 01/36] Implement groupby ``all``/``any`` via bool-coercion + min/max Both methods previously raised ``NotImplementedError``. Reduce ``all``/ ``any`` to ``min``/``max`` on a bool-coerced copy of the value columns: - Strings coerce as ``count_characters > 0`` so empty strings become ``False`` and nulls remain null (preserving them through the agg). - Numerics coerce as ``!= 0`` with the same null preservation. - ``skipna=False`` replaces nulls with ``True`` before the aggregation so that nulls don't flip ``all`` to ``False`` and trivially make ``any`` ``True``. - Empty groups (all-NA values, skipna=True) yield NA from min/max; pandas treats those as vacuously ``True`` for ``all`` and ``False`` for ``any``, so the result is filled accordingly. - ``min_count`` masks groups whose non-null count is below the threshold. Conftest update for ``test_string_dtype_all_na[*-all-*]`` and ``[*-any-*]`` (32 entries). The string-key DataFrame cases additionally rely on identity-based grouping-key column exclusion, which lands in a sibling PR; both must merge before the entries can be removed without xpassing. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- python/cudf/cudf/core/groupby/groupby.py | 110 ++++++++++++++++-- .../cudf/pandas/scripts/conftest-patch.py | 32 ----- .../cudf/tests/groupby/test_reductions.py | 45 +++++++ 3 files changed, 148 insertions(+), 39 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 0ad04470a5f..ddbbc0caf9c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -32,7 +32,10 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.core.common import pipe from cudf.core.copy_types import GatherMap -from cudf.core.dtype.validators import is_dtype_obj_numeric +from cudf.core.dtype.validators import ( + is_dtype_obj_numeric, + is_dtype_obj_string, +) from cudf.core.dtypes import ( CategoricalDtype, DecimalDtype, @@ -2991,18 +2994,111 @@ def ewm(self, *args, **kwargs): def any(self, skipna: bool = True, min_count: int = 0, **kwargs: Any): """ Return True if any value in the group is truthful, else False. - - Currently not implemented. """ - raise NotImplementedError("any is currently not implemented") + return self._bool_reduce("any", skipna=skipna, min_count=min_count) def all(self, skipna: bool = True, min_count: int = 0, **kwargs: Any): """ Return True if all values in the group are truthful, else False. - - Currently not implemented. 
""" - raise NotImplementedError("all is currently not implemented") + return self._bool_reduce("all", skipna=skipna, min_count=min_count) + + def _bool_reduce(self, op: str, *, skipna: bool, min_count: int): + """Implement all/any as min/max on bool-coerced value columns.""" + from cudf.core.dataframe import DataFrame + from cudf.core.series import Series + + agg_name = {"all": "min", "any": "max"}[op] + # Empty-group fill value: vacuously True for all, vacuously False for any + fill_value = op == "all" + + is_series = isinstance(self.obj, Series) + + # Coerce each value column to a (nullable) bool column so that + # nulls are preserved through the aggregation (min/max skip + # nulls). For ``skipna=False``, nulls are replaced with True so + # they don't flip ``all`` to False and always make ``any`` True. + def _to_bool_col(col): + from cudf.core.column import ColumnBase + + if isinstance(col.dtype, pd.StringDtype) or is_dtype_obj_string( + col.dtype + ): + counts_plc = plc.strings.attributes.count_characters( + col.plc_column + ) + gt_plc = plc.binaryop.binary_operation( + counts_plc, + plc.Scalar.from_py(0), + plc.binaryop.BinaryOperator.GREATER, + plc.DataType(plc.TypeId.BOOL8), + ) + bool_col = ColumnBase.create(gt_plc, np.dtype(np.bool_)) + else: + # For numeric/bool inputs, cast to bool preserving nulls. 
+ ne_plc = plc.binaryop.binary_operation( + col.plc_column, + plc.Scalar.from_py(0), + plc.binaryop.BinaryOperator.NOT_EQUAL, + plc.DataType(plc.TypeId.BOOL8), + ) + bool_col = ColumnBase.create(ne_plc, np.dtype(np.bool_)) + if not skipna: + bool_col = bool_col.fillna(True) + return bool_col + + if is_series: + new_obj = Series._from_column( + _to_bool_col(self.obj._column), name=self.obj.name + ) + else: + new_data = { + col_name: _to_bool_col(self.obj._data[col_name]) + for col_name in self.grouping._values_column_names + } + new_obj = DataFrame._from_data(new_data, index=self.obj.index) + + # Reuse the same grouping so key columns match ``new_obj`` exactly, + # avoiding label-based lookup when the key column was excluded. + bool_gb = type(self)( + new_obj, + by=self.grouping, + level=None, + sort=self._sort, + as_index=self._as_index, + dropna=self._dropna, + ) + result = bool_gb.agg(agg_name) + + # Empty groups (skipna=True with all-NA values) yield NA from + # min/max — pandas treats these as ``True`` for ``all`` and + # ``False`` for ``any``. 
+ bool_np = np.dtype(np.bool_) + if isinstance(result, Series): + result = result.fillna(fill_value).astype(bool_np) + else: + for col_name in result._column_names: + result[col_name] = ( + result[col_name].fillna(fill_value).astype(bool_np) + ) + + if min_count and min_count > 0: + counts = self.agg("count") + if isinstance(result, Series): + count_series = ( + counts if isinstance(counts, Series) else counts.iloc[:, 0] + ) + result = result.where(count_series >= min_count, None) + else: + for col_name in result._column_names: + if col_name not in counts._column_names: + continue + count_col = counts._data[col_name] + mask = count_col < min_count + result[col_name] = result[col_name].where( + ~Series._from_column(mask), None + ) + return result class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin): diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 9817fb48c25..66dca3fdeb2 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -3857,14 +3857,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_nunique_with_empty_series": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_nunique_with_object": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_nunique_with_timegrouper": "TODO: Add a reason for failure", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-1]": "AssertionError: DataFrame are 
different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-count-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-count-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-count-True-False-0]": "AssertionError: DataFrame are different", @@ -3897,14 +3889,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-0]": "AssertionError: DataFrame are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-count-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-count-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-count-True-False-0]": "AssertionError: DataFrame are different", @@ -3945,14 +3929,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-False-True-0]": "AssertionError: Attributes of Series are different", @@ -4001,14 +3977,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-0]": "AssertionError: DataFrame are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-False-True-0]": "AssertionError: Attributes of Series are different", diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py index fc664bae59a..2e9efe26f47 100644 --- a/python/cudf/cudf/tests/groupby/test_reductions.py +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -1189,3 +1189,48 @@ def test_string_groupby_key_index(): got = gdf.groupby("a", sort=True).count() assert_eq(expect, got, check_dtype=False) + + +@pytest.mark.parametrize("op", ["all", "any"]) +@pytest.mark.parametrize( + "data", + [ + [True, 
False, True, True, False, False], + [1, 0, 2, 3, 0, 0], + [1.0, 0.0, 2.5, 3.5, 0.0, 0.0], + ], +) +def test_groupby_all_any(op, data): + pdf = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": data}) + gdf = cudf.from_pandas(pdf) + with cudf.option_context("mode.pandas_compatible", True): + got = getattr(gdf.groupby("a"), op)() + expect = getattr(pdf.groupby("a"), op)() + assert_eq(expect, got) + + +@pytest.mark.parametrize("op", ["all", "any"]) +def test_groupby_all_any_string(op): + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2, 3, 3], "b": ["x", "", "", "", "y", "z"]} + ) + gdf = cudf.from_pandas(pdf) + with cudf.option_context("mode.pandas_compatible", True): + got = getattr(gdf.groupby("a"), op)() + expect = getattr(pdf.groupby("a"), op)() + assert_eq(expect, got) + + +@pytest.mark.parametrize("op", ["all", "any"]) +def test_groupby_all_any_empty(op): + pdf = pd.DataFrame( + { + "a": pd.array([], dtype="int64"), + "b": pd.array([], dtype="bool"), + } + ) + gdf = cudf.from_pandas(pdf) + with cudf.option_context("mode.pandas_compatible", True): + got = getattr(gdf.groupby("a"), op)() + expect = getattr(pdf.groupby("a"), op)() + assert_eq(expect, got, check_index_type=False) From 8992d3983daab6ae9c4db6f091d3c4a8e8f47f70 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 6 May 2026 15:49:03 -0500 Subject: [PATCH 02/36] Apply suggestions from code review Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- python/cudf/cudf/core/groupby/groupby.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index ddbbc0caf9c..8538953ea7e 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3021,9 +3021,7 @@ def _bool_reduce(self, op: str, *, skipna: bool, min_count: int): def _to_bool_col(col): from cudf.core.column import ColumnBase - if isinstance(col.dtype, pd.StringDtype) or is_dtype_obj_string( 
- col.dtype - ): + if is_dtype_obj_string(col.dtype): counts_plc = plc.strings.attributes.count_characters( col.plc_column ) From 82852237e27b48b42b5f9afe961b236abe325740 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 4 May 2026 16:56:54 -0500 Subject: [PATCH 03/36] Reject pd.NA string-to-object casts in pandas-compatible mode (#22295) ## Description In pandas-compatible mode, reject casting nullable string columns that use `pd.NA` as their missing-value sentinel to numpy `object` dtype. This came from a pandas 3 compatibility issue in `cudf.pandas`: pandas preserves `pd.NA` when `StringDtype(na_value=pd.NA)` is cast to `object`, while cuDF's string-to-object path materializes nulls as Python `None`. Preserving that sentinel would require carrying source dtype metadata after the result has become plain `object`, which the review pointed out is not a good fit for the current column model. Instead, when `mode.pandas_compatible` is enabled, this PR now raises in `StringColumn.as_string_column` for: - `pd.StringDtype(..., na_value=pd.NA)` -> `object` - string `pd.ArrowDtype` -> `object` Outside pandas-compatible mode, the existing string-to-object cast behavior is unchanged. String dtypes that use `np.nan` as their missing-value sentinel and ordinary object string columns also keep the existing behavior. ## Changes - Add an explicit pandas-compatible-mode `NotImplementedError` for nullable `pd.NA` string-to-object casts in `python/cudf/cudf/core/column/string.py`. - Add focused coverage in `python/cudf/cudf/tests/series/methods/test_astype.py` for both pandas-compatible and non-pandas-compatible behavior. - Remove the previous per-instance `_PANDAS_NA_VALUE` override path. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. 
--- python/cudf/cudf/core/column/string.py | 12 ++ .../cudf/pandas/scripts/conftest-patch.py | 126 ------------------ .../cudf/tests/series/methods/test_astype.py | 40 ++++++ 3 files changed, 52 insertions(+), 126 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 1e13c23ec8d..47ab9262ac5 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -338,6 +338,18 @@ def as_string_column(self, dtype: DtypeObj) -> Self: if isinstance(dtype, np.dtype) and dtype.kind == "U": dtype = np.dtype("object") if dtype != self.dtype: + if ( + cudf.get_option("mode.pandas_compatible") + and self.null_count != 0 + and isinstance(dtype, np.dtype) + and dtype == np.dtype("O") + and isinstance(self.dtype, (pd.StringDtype, pd.ArrowDtype)) + and self.dtype.na_value is pd.NA + ): + raise NotImplementedError( + "Casting nullable string columns with pd.NA to object " + "is not supported." + ) return cast(Self, ColumnBase.create(self.plc_column, dtype)) return self diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 66dca3fdeb2..8e65569d557 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -1425,7 +1425,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_grouping_grouper[uint32]": "AssertionError: ndarray Expected type , found instead", "tests/extension/test_arrow.py::TestArrowArray::test_grouping_grouper[uint64]": "AssertionError: ndarray Expected type , found instead", "tests/extension/test_arrow.py::TestArrowArray::test_grouping_grouper[uint8]": "AssertionError: ndarray Expected type , found instead", - "tests/extension/test_arrow.py::TestArrowArray::test_loc_setitem_with_expansion_preserves_ea_index_dtype[string]": "AssertionError: DataFrame.index are different", 
"tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[bool-prod-True]": "AssertionError: Attributes of ExtensionArray are different", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[bool-sum-True]": "AssertionError: Attributes of ExtensionArray are different", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_frame[decimal128(7, 3)-mean-False]": "TODO: Add a reason for failure", @@ -1491,13 +1490,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint64-kurt-True]": "TODO: Add a reason for failure", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint8-kurt-False]": "TODO: Add a reason for failure", "tests/extension/test_arrow.py::TestArrowArray::test_reduce_series_numeric[uint8-kurt-True]": "TODO: Add a reason for failure", - "tests/extension/test_arrow.py::TestArrowArray::test_setitem_series[string-full_slice]": "AssertionError: Series are different", - "tests/extension/test_arrow.py::TestArrowArray::test_setitem_series[string-index]": "AssertionError: Series are different", - "tests/extension/test_arrow.py::TestArrowArray::test_setitem_series[string-list(range)]": "AssertionError: Series are different", - "tests/extension/test_arrow.py::TestArrowArray::test_setitem_series[string-list[index]]": "AssertionError: Series are different", - "tests/extension/test_arrow.py::TestArrowArray::test_setitem_series[string-mask]": "AssertionError: Series are different", - "tests/extension/test_arrow.py::TestArrowArray::test_setitem_series[string-null_slice]": "AssertionError: Series are different", - "tests/extension/test_arrow.py::TestArrowArray::test_setitem_series[string-range]": "AssertionError: Series are different", "tests/extension/test_arrow.py::TestArrowArray::test_setitem_with_expansion_row[string]": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='data') are different", 
"tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-frame-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-frame-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", @@ -1505,12 +1497,6 @@ def pytest_unconfigure(config): "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-series-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-series-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", "tests/extension/test_arrow.py::TestArrowArray::test_unstack[decimal128(7, 3)-series-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='A') are different", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-frame-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-frame-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-frame-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='('A', 'A')') are different", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-series-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-series-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_arrow.py::TestArrowArray::test_unstack[string-series-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='A') are different", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and": "TODO: Add a reason for failure", 
"tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and_scalar[False-expected3]": "TODO: Add a reason for failure", "tests/extension/test_arrow.py::TestLogicalOps::test_kleene_and_scalar[None-expected0]": "TODO: Add a reason for failure", @@ -2090,30 +2076,6 @@ def pytest_unconfigure(config): "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", "tests/extension/test_string.py::TestStringArray::test_unstack[string=str[python]-True-series-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='A') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-frame-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='('A', 'A')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-series-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-series-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-False-series-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='A') are different", - 
"tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-frame-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-frame-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-frame-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='('A', 'A')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-series-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-series-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[pyarrow]-True-series-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='A') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-frame-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-frame-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-frame-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='('A', 'A')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-series-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-series-index2]": "AssertionError: 
DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-False-series-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='A') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-frame-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-frame-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='('A', 'B')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-frame-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='('A', 'A')') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-series-index1]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-series-index2]": "AssertionError: DataFrame.iloc[:, 1] (column name='B') are different", - "tests/extension/test_string.py::TestStringArray::test_unstack[string=string[python]-True-series-index3]": "AssertionError: DataFrame.iloc[:, 0] (column name='A') are different", "tests/frame/constructors/test_from_records.py::TestFromRecords::test_from_records_empty": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='a') are different", "tests/frame/constructors/test_from_records.py::TestFromRecords::test_from_records_empty_iterator_with_preserve_columns": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='col_1') are different", "tests/frame/constructors/test_from_records.py::TestFromRecords::test_from_records_misc_brokenness": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='a') are different", @@ -4475,7 +4437,6 @@ def pytest_unconfigure(config): 
"tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[backfill]": "Failed: DID NOT RAISE ", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[nearest]": "Failed: DID NOT RAISE ", "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_indexer_with_method_numeric_vs_bool[pad]": "Failed: DID NOT RAISE ", - "tests/indexes/numeric/test_indexing.py::TestGetIndexer::test_get_loc_masked_na_and_nan[False]": "ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()", "tests/indexes/numeric/test_numeric.py::TestFloatNumericIndex::test_equals_numeric": "TODO: Add a reason for failure", "tests/indexes/numeric/test_numeric.py::TestFloatNumericIndex::test_type_coercion_fail[int16]": "TODO: Add a reason for failure", "tests/indexes/numeric/test_numeric.py::TestFloatNumericIndex::test_type_coercion_fail[int32]": "TODO: Add a reason for failure", @@ -4558,12 +4519,6 @@ def pytest_unconfigure(config): "tests/indexes/test_any_index.py::TestConversion::test_to_series_with_arguments[uint8]": "TODO: Add a reason for failure", "tests/indexes/test_base.py::TestIndex::test_cached_properties_not_settable": "TODO: Add a reason for failure", "tests/indexes/test_base.py::TestIndex::test_constructor_dtypes_datetime[None-Index-values]": "TODO: Add a reason for failure", - "tests/indexes/test_base.py::TestIndex::test_empty_fancy[bool-float32]": "Failed: DID NOT RAISE ", - "tests/indexes/test_base.py::TestIndex::test_empty_fancy[bool-float64]": "Failed: DID NOT RAISE ", - "tests/indexes/test_base.py::TestIndex::test_empty_fancy[bool-int32]": "Failed: DID NOT RAISE ", - "tests/indexes/test_base.py::TestIndex::test_empty_fancy[bool-int64]": "Failed: DID NOT RAISE ", - "tests/indexes/test_base.py::TestIndex::test_empty_fancy[bool-uint32]": "Failed: DID NOT RAISE ", - "tests/indexes/test_base.py::TestIndex::test_empty_fancy[bool-uint64]": "Failed: 
DID NOT RAISE ", "tests/indexes/test_base.py::TestIndex::test_equals_op_mismatched_multiindex_raises[index0]": "TODO: Add a reason for failure", "tests/indexes/test_base.py::TestIndex::test_is_": "TODO: Add a reason for failure", "tests/indexes/test_base.py::TestIndex::test_is_object[string-True]": "AssertionError: assert True is False", @@ -5999,7 +5954,6 @@ def pytest_unconfigure(config): "tests/resample/test_resample_api.py::test_agg_with_lambda[df_resample-agg1]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_with_lambda[df_resample-agg2]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_groupby_resample_api": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_resample_group_keys": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_transform_frame[None]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_transform_frame[date]": "TODO: Add a reason for failure", "tests/resample/test_resampler_grouper.py::test_apply_columns_multilevel": "AssertionError: (, None)", @@ -6200,8 +6154,6 @@ def pytest_unconfigure(config): "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_on_ints_floats[int_vals1-float_vals1-exp_vals1]": "TODO: Add a reason for failure", "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_on_ints_floats[int_vals2-float_vals2-exp_vals2]": "TODO: Add a reason for failure", "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_on_ints_floats_warning": "TODO: Add a reason for failure", - "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_with_intc_columns": "AssertionError: DataFrame.iloc[:, 0] (column name='a') are different", - "tests/reshape/merge/test_merge.py::TestMergeDtypes::test_merge_with_uintc_columns": "AssertionError: DataFrame.iloc[:, 0] (column name='a') are different", "tests/reshape/merge/test_merge.py::test_merge_arrow_and_numpy_dtypes[int64]": 
"AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='a') are different", "tests/reshape/merge/test_merge.py::test_merge_different_index_names": "TODO: Add a reason for failure", "tests/reshape/merge/test_merge.py::test_merge_ea_and_non_ea[Float32-right]": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='a') are different", @@ -7356,86 +7308,8 @@ def pytest_unconfigure(config): "tests/strings/test_split_partition.py::test_partition_series_stdlib[string=string[pyarrow]-rpartition]": "TODO: Add a reason for failure", "tests/strings/test_split_partition.py::test_partition_series_stdlib[string=string[python]-partition]": "TODO: Add a reason for failure", "tests/strings/test_split_partition.py::test_partition_series_stdlib[string=string[python]-rpartition]": "TODO: Add a reason for failure", - "tests/strings/test_split_partition.py::test_split_blank_string_with_non_empty[string=object]": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='0') are different", "tests/strings/test_split_partition.py::test_split_nan_expand[string=object]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_split_partition.py::test_split_to_dataframe_unequal_splits[string=object]": "AssertionError: Attributes of DataFrame.iloc[:, 3] (column name='3') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-capitalize]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-casefold]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-cat1]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-center]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-extract0]": "AssertionError: Series are different", - 
"tests/strings/test_string_array.py::test_string_array[string[pyarrow]-extract1]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-extractall]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-get]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-join]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-ljust]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-lower]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-lstrip]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-normalize]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-pad]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-partition1]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-partition2]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-removeprefix]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-removesuffix]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-repeat]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-replace]": "AssertionError: Series are different", - 
"tests/strings/test_string_array.py::test_string_array[string[pyarrow]-rjust]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-rpartition1]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-rpartition2]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-rstrip]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-slice0]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-slice1]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-slice_replace0]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-slice_replace1]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-split1]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-strip]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-swapcase]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-title]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-translate]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-upper]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[pyarrow]-wrap]": "AssertionError: Series are different", - 
"tests/strings/test_string_array.py::test_string_array[string[pyarrow]-zfill]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-capitalize]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-casefold]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-cat1]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-center]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-extract0]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-extract1]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-extractall]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-get]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-join]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-ljust]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-lower]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-lstrip]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-normalize]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-pad]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-partition1]": "AssertionError: 
DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-partition2]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-removeprefix]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-removesuffix]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-repeat]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-replace]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-rjust]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-rpartition1]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-rpartition2]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-rstrip]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-slice0]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-slice1]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-slice_replace0]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-slice_replace1]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-split1]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - 
"tests/strings/test_string_array.py::test_string_array[string[python]-strip]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-swapcase]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-title]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-translate]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-upper]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-wrap]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array[string[python]-zfill]": "AssertionError: Series are different", - "tests/strings/test_string_array.py::test_string_array_extract[string[pyarrow]]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", - "tests/strings/test_string_array.py::test_string_array_extract[string[python]]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", "tests/strings/test_strings.py::test_index_str_accessor_multiindex_raises": "TODO: Add a reason for failure", - "tests/strings/test_strings.py::test_split_join_roundtrip[string=str[pyarrow]]": "AssertionError: Series are different", - "tests/strings/test_strings.py::test_split_join_roundtrip[string=str[python]]": "AssertionError: Series are different", "tests/strings/test_strings.py::test_split_join_roundtrip[string=string[pyarrow]]": "AssertionError: Series are different", "tests/strings/test_strings.py::test_split_join_roundtrip[string=string[python]]": "AssertionError: Series are different", "tests/strings/test_strings.py::test_string_slice_out_of_bounds[string=object]": "AssertionError: Series are different", diff --git a/python/cudf/cudf/tests/series/methods/test_astype.py 
b/python/cudf/cudf/tests/series/methods/test_astype.py index 2f809b7c6d9..bf6a1b866e0 100644 --- a/python/cudf/cudf/tests/series/methods/test_astype.py +++ b/python/cudf/cudf/tests/series/methods/test_astype.py @@ -69,6 +69,46 @@ def test_series_typecast_to_object(): assert new_series[0] == "1970-01-01 00:00:00.000000001" +@pytest.mark.parametrize( + "dtype", + [ + pd.StringDtype(storage="python", na_value=pd.NA), + pd.StringDtype(storage="pyarrow", na_value=pd.NA), + pd.ArrowDtype(pa.string()), + ], +) +def test_string_astype_object_pd_na_pandas_compat(dtype): + sr = cudf.Series(["a", None, "b"], dtype=dtype) + + with cudf.option_context("mode.pandas_compatible", True): + with pytest.raises( + NotImplementedError, + match="Casting nullable string columns with pd.NA to object", + ): + sr.astype(object) + + with cudf.option_context("mode.pandas_compatible", False): + result = sr.astype(object) + assert result.dtype == np.dtype("object") + + +@pytest.mark.parametrize( + "dtype", + [ + pd.StringDtype(storage="python", na_value=pd.NA), + pd.StringDtype(storage="pyarrow", na_value=pd.NA), + pd.ArrowDtype(pa.string()), + ], +) +def test_string_astype_object_pd_na_pandas_compat_no_nulls(dtype): + sr = cudf.Series(["a", "b", "c"], dtype=dtype) + + with cudf.option_context("mode.pandas_compatible", True): + result = sr.astype(object) + assert result.dtype == np.dtype("object") + assert result.to_arrow().to_pylist() == ["a", "b", "c"] + + @pytest.mark.parametrize( "dtype", [ From ca6dddcf990ed72df0f64887eb3afbb1aaa647cc Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Mon, 4 May 2026 20:24:34 +0200 Subject: [PATCH 04/36] Remove legacy Dask-based streaming backends (#22358) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drops the legacy `Cluster.DISTRIBUTED` cluster and the entire `rapidsmpf.integrations.dask` execution path. The new `DaskEngine` (`Cluster.DASK`) is unaffected. 
Note: all removed components were under `experimental`, so no deprecation period is required. **What’s removed** * `Cluster.DISTRIBUTED` enum value and all dispatch paths (`rapidsmpf/core.py`, `parallel.py:get_scheduler`) * `experimental/dask_registers.py`, `experimental/spilling.py`, `experimental/rapidsmpf/dask.py` * `rapidsmpf_distributed_available()`, `StreamingExecutor.rapidsmpf_spill`, and `cluster_kind` plumbing in `shuffle.py` and `sort.py` * Legacy benchmark harness (`benchmarks/utils_legacy.py`) and the `utils.py` dispatch shim * Legacy test suite (`tests/experimental/legacy/`) and Dask registration test files **What stays** * `Cluster.DASK` / `DaskEngine` (`frontend/dask.py`), the supported Dask backend * `Cluster.SINGLE`, `SPMD`, and `RAY` streaming frontends * The task-graph backend (`Runtime.TASKS`). Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/22358 --- ci/run_cudf_polars_experimental_pytests.sh | 8 +- python/cudf_polars/cudf_polars/callback.py | 12 +- .../experimental/benchmarks/utils.py | 59 +- .../experimental/benchmarks/utils_legacy.py | 2100 ----------------- .../experimental/dask_registers.py | 227 -- .../cudf_polars/experimental/io.py | 10 +- .../cudf_polars/experimental/parallel.py | 77 +- .../experimental/rapidsmpf/core.py | 31 +- .../experimental/rapidsmpf/dask.py | 194 -- .../cudf_polars/experimental/rapidsmpf/io.py | 2 +- .../cudf_polars/experimental/shuffle.py | 65 +- .../cudf_polars/experimental/sort.py | 65 +- .../cudf_polars/experimental/spilling.py | 148 -- .../cudf_polars/cudf_polars/utils/config.py | 97 +- python/cudf_polars/tests/conftest.py | 8 +- .../tests/experimental/legacy/__init__.py | 8 - .../tests/experimental/legacy/conftest.py | 46 - .../experimental/legacy/test_distributed.py | 67 - 
.../tests/experimental/legacy/test_explain.py | 89 - .../experimental/legacy/test_parallel.py | 127 - .../tests/experimental/legacy/test_shuffle.py | 101 - .../experimental/legacy/test_shuffler.py | 79 - .../tests/experimental/legacy/test_sort.py | 152 -- .../tests/experimental/test_dask_serialize.py | 129 - .../tests/experimental/test_dask_sizeof.py | 32 - .../tests/experimental/test_dask_tokenize.py | 32 - .../tests/experimental/test_io_multirank.py | 2 +- .../tests/experimental/test_sink.py | 13 - python/cudf_polars/tests/test_config.py | 77 +- python/cudf_polars/tests/test_profile.py | 5 +- 30 files changed, 106 insertions(+), 3956 deletions(-) delete mode 100644 python/cudf_polars/cudf_polars/experimental/benchmarks/utils_legacy.py delete mode 100644 python/cudf_polars/cudf_polars/experimental/dask_registers.py delete mode 100644 python/cudf_polars/cudf_polars/experimental/rapidsmpf/dask.py delete mode 100644 python/cudf_polars/cudf_polars/experimental/spilling.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/__init__.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/conftest.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/test_distributed.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/test_explain.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/test_parallel.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/test_shuffle.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/test_shuffler.py delete mode 100644 python/cudf_polars/tests/experimental/legacy/test_sort.py delete mode 100644 python/cudf_polars/tests/experimental/test_dask_serialize.py delete mode 100644 python/cudf_polars/tests/experimental/test_dask_sizeof.py delete mode 100644 python/cudf_polars/tests/experimental/test_dask_tokenize.py diff --git a/ci/run_cudf_polars_experimental_pytests.sh b/ci/run_cudf_polars_experimental_pytests.sh index ea0fe69f37c..d0a4767bd99 100755 --- 
a/ci/run_cudf_polars_experimental_pytests.sh +++ b/ci/run_cudf_polars_experimental_pytests.sh @@ -11,10 +11,4 @@ set -euo pipefail cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ echo "Running the full cudf-polars test suite with both the in-memory and spmd engine" -python -m pytest --cache-clear "$@" tests --ignore=tests/experimental/legacy - -echo "Running experimental legacy tests with the 'rapidsmpf' runtime and a 'distributed' cluster" -python -m pytest --cache-clear "$@" "tests/experimental/legacy" \ - --executor streaming \ - --cluster distributed \ - --runtime rapidsmpf +python -m pytest --cache-clear "$@" tests diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index b06ba2d770f..fb915784f96 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -313,10 +313,8 @@ def _callback( if timer is not None: msg = textwrap.dedent("""\ LazyFrame.profile() is not supported with the streaming executor. - To profile execution with the streaming executor, use: - - - NVIDIA NSight Systems with the 'streaming' scheduler. - - Dask's built-in profiling tools with the 'distributed' scheduler. + To profile execution with the streaming executor, use NVIDIA + NSight Systems with the 'streaming' scheduler. """) raise NotImplementedError(msg) @@ -368,12 +366,6 @@ def execute_with_cudf( if timer is not None: timer.store(start, time.monotonic_ns(), "gpu-ir-translation") - if ( - memory_resource is None - and translator.config_options.executor.name == "streaming" - and translator.config_options.executor.cluster == "distributed" - ): # pragma: no cover; Requires distributed cluster - memory_resource = rmm.mr.get_current_device_resource() if len(ir_translation_errors): # TODO: Display these errors in user-friendly way. 
# tracked in https://github.com/rapidsai/cudf/issues/17051 diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py index fce2966e7da..8591ed18cdd 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils.py @@ -1,56 +1,19 @@ # SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 -""" -Benchmark utilities - forwarding shim. - -Dispatches to ``utils_new_frontends`` when ``--frontend`` appears in ``sys.argv``, -otherwise falls back to ``utils_legacy``. -""" +"""Benchmark utilities.""" from __future__ import annotations -import sys - - -def _use_new_frontend() -> bool: - # HACK: Inspect sys.argv to detect use of the new frontends - # (e.g. ``--frontend ray``) without full argument parsing. - # This only works when invoked from the CLI; direct imports always get the - # legacy path. TODO: Remove this shim once the legacy path is deleted. - args = sys.argv[1:] - has_frontend = "--frontend" in args - has_cluster = "--cluster" in args or "-c" in args - if has_frontend and has_cluster: - raise SystemExit( - "Error: --frontend and --cluster cannot be used together.\n" - " Use --frontend for the new frontend path.\n" - " Use --cluster for the legacy path." 
- ) - return has_frontend - - -if _use_new_frontend(): - from cudf_polars.experimental.benchmarks.utils_new_frontends import ( - COUNT_DTYPE, - QueryResult, - RunConfig, - build_parser, - get_data, - parse_args, - run_duckdb, - run_polars, - ) -else: - from cudf_polars.experimental.benchmarks.utils_legacy import ( # type: ignore[assignment] - COUNT_DTYPE, - QueryResult, - RunConfig, - build_parser, - get_data, - parse_args, - run_duckdb, - run_polars, - ) +from cudf_polars.experimental.benchmarks.utils_new_frontends import ( + COUNT_DTYPE, + QueryResult, + RunConfig, + build_parser, + get_data, + parse_args, + run_duckdb, + run_polars, +) __all__: list[str] = [ "COUNT_DTYPE", diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils_legacy.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils_legacy.py deleted file mode 100644 index 2d18b2747b5..00000000000 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils_legacy.py +++ /dev/null @@ -1,2100 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - -"""Utility functions/classes for running the PDS-H and PDS-DS benchmarks.""" - -from __future__ import annotations - -import argparse -import dataclasses -import importlib -import io -import itertools -import json -import logging -import os -import pprint -import statistics -import sys -import textwrap -import time -import traceback -import uuid -from collections import defaultdict -from datetime import UTC, datetime -from pathlib import Path -from typing import TYPE_CHECKING, Any, Literal, assert_never - -import nvtx - -import polars as pl - -import rmm.statistics - -# The dtype for count() aggregations depends on the presence -# of the polars-runtime-64 package (`polars[rt64]`). 
-HAS_POLARS_RT_64 = pl.config.plr.RUNTIME_REPR == "rt64" -COUNT_DTYPE = pl.UInt64() if HAS_POLARS_RT_64 else pl.UInt32() - -try: - import duckdb - - duckdb_err = None -except ImportError as e: - duckdb = None - duckdb_err = e - -try: - import pynvml -except ImportError: - pynvml = None - -try: - from cudf_polars.dsl.ir import IRExecutionContext - from cudf_polars.dsl.tracing import Scope - from cudf_polars.dsl.translate import Translator - from cudf_polars.experimental.benchmarks.asserts import ( - ValidationError, - assert_tpch_result_equal, - ) - from cudf_polars.experimental.explain import explain_query - from cudf_polars.experimental.parallel import evaluate_streaming - from cudf_polars.utils.config import ConfigOptions - - CUDF_POLARS_AVAILABLE = True -except ImportError: - CUDF_POLARS_AVAILABLE = False - -if TYPE_CHECKING: - from collections.abc import Callable, Sequence - - from cudf_polars.experimental.explain import SerializablePlan - - -POLARS_VALIDATION_OPTIONS = { - "check_row_order": True, - "check_column_order": True, - "check_dtypes": True, - "check_exact": False, - "rel_tol": 1e-5, - "abs_tol": 1e-2, -} - - -def get_validation_options(args: Any) -> dict[str, Any]: - """Get validation options dict from parsed arguments.""" - return { - **POLARS_VALIDATION_OPTIONS, - "abs_tol": args.validation_abs_tol, - } - - -try: - import structlog - import structlog.contextvars - import structlog.processors - import structlog.stdlib -except ImportError: - _HAS_STRUCTLOG = False -else: - _HAS_STRUCTLOG = True - - -ExecutorType = Literal["in-memory", "streaming", "cpu"] - - -@dataclasses.dataclass -class ValidationResult: - """ - Result of a validation run. - - Parameters - ---------- - status - The status of the validation. Either 'Passed' or 'Failed'. - message - The message from the validation. This should be ``None`` if - the validation passed, and a string describing the failure otherwise. - details - Additional details about the validation failure. 
- """ - - status: Literal["Passed", "Failed"] - message: str | None - details: dict[str, Any] | None = None - - @classmethod - def from_error(cls, error: Exception) -> ValidationResult: - """ - Create a ValidationResult from some exception. - - Parameters - ---------- - error : Exception - The error to create a ValidationResult from. - - This will correctly propagate "message" and "details" from - ``cudf_polars.testing.asserts.ValidationError``. - - Returns - ------- - ValidationResult - The ValidationResult created from the error. - """ - match error: - case ValidationError(message=message, details=details): - return cls(status="Failed", message=message, details=details) - case _: - return cls(status="Failed", message=str(error)) - - -@dataclasses.dataclass -class ValidationMethod: - """ - Information about how the validation was performed. - - Parameters - ---------- - expected_source - A name indicating the source of the expected results. - - - 'polars-cpu': Run polars against the same data - - 'duckdb': Compare against pre-computed DuckDB results - - comparison_method - How the comparison was performed. Currently, only - 'polars' is supported, which indicates that ``polars.testing.assert_frame_equal`` - was used. - - comparison_options - Additional options passed to the comparison method, controlling - things like the tolerance for floating point comparisons. 
- """ - - expected_source: Literal["polars-cpu", "duckdb"] - comparison_method: Literal["polars"] - comparison_options: dict[str, Any] - - -@dataclasses.dataclass(kw_only=True) -class FailedRecord: - """Records a failed query iteration.""" - - query: int - iteration: int - status: Literal["error"] = "error" - traceback: str - - -@dataclasses.dataclass(kw_only=True) -class SuccessRecord: - """Results for a single run of a single PDS-H query.""" - - query: int - iteration: int - duration: float - shuffle_stats: dict[str, dict[str, int | float]] | None = None - traces: list[dict[str, Any]] | None = None - validation_result: ValidationResult | None = None - status: Literal["success"] = "success" - - @classmethod - def new( - cls, - query: int, - iteration: int, - duration: float, - shuffle_stats: dict[str, dict[str, int | float]] | None = None, - traces: list[dict[str, Any]] | None = None, - ) -> SuccessRecord: - """Create a Record from plain data.""" - return cls( - query=query, - iteration=iteration, - duration=duration, - shuffle_stats=shuffle_stats, - traces=traces, - ) - - -@dataclasses.dataclass -class QueryRunResult: - """Result of running a single query (all iterations).""" - - query_records: list[SuccessRecord | FailedRecord] - plan: SerializablePlan | None - iteration_failures: list[tuple[int, int]] - validation_failed: bool - - -@dataclasses.dataclass -class VersionInfo: - """Information about the commit of the software used to run the query.""" - - version: str - commit: str - - -@dataclasses.dataclass -class PackageVersions: - """Information about the versions of the software used to run the query.""" - - cudf_polars: str | VersionInfo - polars: str - python: str - rapidsmpf: str | VersionInfo | None - duckdb: str | None - - @classmethod - def collect(cls) -> PackageVersions: - """Collect the versions of the software used to run the query.""" - packages = [ - "cudf_polars", - "duckdb", - "polars", - "rapidsmpf", - ] - versions: dict[str, str | VersionInfo 
| None] = {} - for name in packages: - try: - package = importlib.import_module(name) - except (AttributeError, ImportError): - versions[name] = None - else: - if name in ("cudf_polars", "rapidsmpf"): - versions[name] = VersionInfo( - version=package.__version__, - commit=package.__git_commit__, - ) - else: - versions[name] = package.__version__ - - versions["python"] = ".".join(str(v) for v in sys.version_info[:3]) - # we manually ensure that only cudf-polars and rapidsmpf have a VersionInfo - return cls(**versions) # type: ignore[arg-type] - - -@dataclasses.dataclass -class GPUInfo: - """Information about a specific GPU.""" - - name: str - index: int - free_memory: int | None - used_memory: int | None - total_memory: int | None - - @classmethod - def from_index(cls, index: int) -> GPUInfo: - """Create a GPUInfo from an index.""" - pynvml.nvmlInit() - handle = pynvml.nvmlDeviceGetHandleByIndex(index) - try: - memory = pynvml.nvmlDeviceGetMemoryInfo(handle) - return cls( - name=pynvml.nvmlDeviceGetName(handle), - index=index, - free_memory=memory.free, - used_memory=memory.used, - total_memory=memory.total, - ) - except pynvml.NVMLError_NotSupported: - # Happens on systems without traditional GPU memory (e.g., Grace Hopper), - # where nvmlDeviceGetMemoryInfo is not supported. 
- # See: https://github.com/rapidsai/cudf/issues/19427 - return cls( - name=pynvml.nvmlDeviceGetName(handle), - index=index, - free_memory=None, - used_memory=None, - total_memory=None, - ) - - -@dataclasses.dataclass -class HardwareInfo: - """Information about the hardware used to run the query.""" - - gpus: list[GPUInfo] - # TODO: ucx - - @classmethod - def collect(cls) -> HardwareInfo: - """Collect the hardware information.""" - if pynvml is not None: - pynvml.nvmlInit() - gpus = [GPUInfo.from_index(i) for i in range(pynvml.nvmlDeviceGetCount())] - else: - # No GPUs -- probably running in CPU mode - gpus = [] - return cls(gpus=gpus) - - -def _infer_scale_factor(name: str, path: str | Path, suffix: str) -> int | float: - if "pdsh" in name: - supplier = get_data(path, "supplier", suffix) - num_rows = supplier.select(pl.len()).collect().item(0, 0) - return num_rows / 10_000 - - elif "pdsds" in name: - # TODO: Keep a map of SF-row_count because of nonlinear scaling - # See: https://www.tpc.org/TPC_Documents_Current_Versions/pdf/TPC-DS_v4.0.0.pdf pg.46 - customer = get_data(path, "promotion", suffix) - num_rows = customer.select(pl.len()).collect().item(0, 0) - return num_rows / 300 - - else: - raise ValueError(f"Invalid benchmark script name: '{name}'.") - - -@dataclasses.dataclass(kw_only=True) -class RunConfig: - """Results for a PDS-H or PDS-DS query run.""" - - engine_name: Literal["polars-cpu", "cudf-polars", "duckdb"] - queries: list[int] - suffix: str - executor: ExecutorType - runtime: str - stream_policy: str | None - cluster: str - n_workers: int - versions: PackageVersions = dataclasses.field( - default_factory=PackageVersions.collect - ) - records: dict[int, list[SuccessRecord | FailedRecord]] = dataclasses.field( - default_factory=dict - ) - plans: dict[int, SerializablePlan] = dataclasses.field(default_factory=dict) - dataset_path: Path - scale_factor: int | float - qualification: bool = False - shuffle: Literal["rapidsmpf", "tasks"] | None = None - 
gather_shuffle_stats: bool = False - broadcast_join_limit: int | None = None - blocksize: int | None = None - max_rows_per_partition: int | None = None - threads: int - iterations: int - timestamp: str = dataclasses.field( - default_factory=lambda: datetime.now(UTC).isoformat() - ) - hardware: HardwareInfo = dataclasses.field(default_factory=HardwareInfo.collect) - run_id: uuid.UUID = dataclasses.field(default_factory=uuid.uuid4) - rmm_async: bool - rapidsmpf_oom_protection: bool - rapidsmpf_spill: bool - spill_device: float - query_set: str - collect_traces: bool = False - dynamic_planning: bool | None = None - max_io_threads: int - native_parquet: bool - spill_to_pinned_memory: bool - extra_info: dict[str, Any] = dataclasses.field(default_factory=dict) - fallback_mode: str | None = None - validation_method: ValidationMethod | None = None - io_mode: Literal["cold", "lukewarm", "hot"] = "lukewarm" - duckdb_threads: int | None = None - duckdb_memory_limit: str | None = None - duckdb_temp_dir: str | None = None - - def __post_init__(self) -> None: # noqa: D105 - if self.gather_shuffle_stats and self.shuffle != "rapidsmpf": - raise ValueError( - "gather_shuffle_stats is only supported when shuffle='rapidsmpf'." - ) - if self.io_mode == "hot" and self.iterations < 2: - raise ValueError( - "--io-mode hot requires at least 2 iterations: " - "iteration 0 warms the cache, iterations 1+ are the hot measurements." 
- ) - - @classmethod - def from_args(cls, args: argparse.Namespace) -> RunConfig: - """Create a RunConfig from command line arguments.""" - executor: ExecutorType = args.executor - cluster = args.cluster - runtime = args.runtime - stream_policy = args.stream_policy - - # Handle "auto" stream policy - if stream_policy == "auto": - stream_policy = None - - # Deal with non-streaming executors - if executor == "in-memory" or executor == "cpu": - cluster = "single" - - path = args.path - name = args.query_set - scale_factor = args.scale - - if args.qualification and "pdsds" not in name: - raise ValueError("--qualification can only be used with PDS-DS benchmarks.") - - if scale_factor is None: - if "pdsds" in name: - raise ValueError( - "--scale is required for PDS-DS benchmarks.\n" - "TODO: This will be inferred once we maintain a map of scale factors to row counts." - ) - if path is None: - raise ValueError( - "Must specify --root and --scale if --path is not specified." - ) - # For PDS-H, infer scale factor based on row count - scale_factor = _infer_scale_factor(name, path, args.suffix) - if path is None: - path = f"{args.root}/scale-{scale_factor}" - - scale_factor = float(scale_factor) - try: - scale_factor_int = int(scale_factor) - except ValueError: - pass - else: - if scale_factor_int == scale_factor: - scale_factor = scale_factor_int - - skip_scale_factor_inference = ( - "LIBCUDF_IO_REROUTE_LOCAL_DIR_PATTERN" in os.environ - ) and ("LIBCUDF_IO_REROUTE_REMOTE_DIR_PATTERN" in os.environ) - - if ( - "pdsh" in name - and args.scale is not None - and skip_scale_factor_inference is False - ): - # Validate the user-supplied scale factor - sf_inf = _infer_scale_factor(name, path, args.suffix) - rel_error = abs((scale_factor - sf_inf) / sf_inf) - if rel_error > 0.01: - raise ValueError( - f"Specified scale factor is {args.scale}, " - f"but the inferred scale factor is {sf_inf}." 
- ) - - if args.validate_directory: - validation_method = ValidationMethod( - expected_source="duckdb", - comparison_method="polars", - comparison_options=get_validation_options(args), - ) - elif args.validate: - validation_method = ValidationMethod( - expected_source="polars-cpu" if args.baseline == "cpu" else "duckdb", - comparison_method="polars", - comparison_options=get_validation_options(args), - ) - else: - validation_method = None - - engine_name: Literal["polars-cpu", "cudf-polars", "duckdb"] - if args.engine == "duckdb": - engine_name = "duckdb" - elif args.engine == "polars": - if executor == "cpu": - engine_name = "polars-cpu" - else: - engine_name = "cudf-polars" - else: - raise ValueError(f"Invalid engine: {args.engine}") - - return cls( - engine_name=engine_name, - queries=args.query, - executor=executor, - cluster=cluster, - runtime=runtime, - stream_policy=stream_policy, - n_workers=args.n_workers, - shuffle=args.shuffle, - gather_shuffle_stats=args.rapidsmpf_dask_statistics, - broadcast_join_limit=args.broadcast_join_limit, - dataset_path=path, - scale_factor=scale_factor, - qualification=args.qualification, - blocksize=args.blocksize, - threads=args.threads, - iterations=args.iterations, - suffix=args.suffix, - rmm_async=args.rmm_async, - rapidsmpf_oom_protection=args.rapidsmpf_oom_protection, - spill_device=args.spill_device, - rapidsmpf_spill=args.rapidsmpf_spill, - max_rows_per_partition=args.max_rows_per_partition, - query_set=args.query_set, - collect_traces=args.collect_traces, - dynamic_planning=args.dynamic_planning, - max_io_threads=args.max_io_threads, - native_parquet=args.native_parquet, - extra_info=args.extra_info, - spill_to_pinned_memory=args.spill_to_pinned_memory, - fallback_mode=args.fallback_mode, - validation_method=validation_method, - io_mode=args.io_mode, - duckdb_threads=args.duckdb_threads, - duckdb_memory_limit=args.duckdb_memory_limit, - duckdb_temp_dir=args.duckdb_temp_dir, - ) - - def serialize(self, engine: 
pl.GPUEngine | None) -> dict: - """Serialize the run config to a dictionary.""" - result = dataclasses.asdict(self) - result["run_id"] = str(self.run_id) - - if engine is not None: - config_options = ConfigOptions.from_polars_engine(engine) - result["config_options"] = dataclasses.asdict(config_options) - return result - - def summarize(self) -> None: - """Print a summary of the results.""" - print("Iteration Summary") - print("=======================================") - - for query, records in self.records.items(): - print(f"query: {query}") - print(f"path: {self.dataset_path}") - print(f"scale_factor: {self.scale_factor}") - print(f"executor: {self.executor}") - print(f"stream_policy: {self.stream_policy}") - if self.executor == "streaming": - print(f"runtime: {self.runtime}") - print(f"cluster: {self.cluster}") - print(f"blocksize: {self.blocksize}") - print(f"shuffle_method: {self.shuffle}") - print(f"broadcast_join_limit: {self.broadcast_join_limit}") - if self.runtime == "rapidsmpf": - print(f"native_parquet: {self.native_parquet}") - print(f"dynamic_planning: {self.dynamic_planning}") - if self.cluster == "distributed": - print(f"n_workers: {self.n_workers}") - print(f"threads: {self.threads}") - print(f"rmm_async: {self.rmm_async}") - print(f"rapidsmpf_oom_protection: {self.rapidsmpf_oom_protection}") - print(f"spill_device: {self.spill_device}") - print(f"rapidsmpf_spill: {self.rapidsmpf_spill}") - valid_durations = [ - record.duration for record in records if record.status == "success" - ] - if len(valid_durations) > 0: - print(f"iterations: {self.iterations}") - print("---------------------------------------") - print(f"min time : {min(valid_durations):0.4f}") - print(f"max time : {max(valid_durations):0.4f}") - print(f"mean time: {statistics.mean(valid_durations):0.4f}") - print("=======================================") - any_success = any(record.status == "success" for record in records) - - if any_success: - total_mean_time = sum( - statistics.mean( 
- record.duration for record in records if record.status == "success" - ) - for records in self.records.values() - if records - ) - print(f"Total mean time across all queries: {total_mean_time:.4f} seconds") - else: - print("No successful queries") - - -def get_data(path: str | Path, table_name: str, suffix: str = "") -> pl.LazyFrame: - """Get table from dataset.""" - return pl.scan_parquet(f"{path}/{table_name}{suffix}") - - -def get_executor_options( - run_config: RunConfig, benchmark: Any = None -) -> dict[str, Any]: - """Generate executor_options for GPUEngine.""" - executor_options: dict[str, Any] = {} - - if run_config.executor == "streaming": - if run_config.blocksize: - executor_options["target_partition_size"] = run_config.blocksize - if run_config.max_rows_per_partition: - executor_options["max_rows_per_partition"] = ( - run_config.max_rows_per_partition - ) - if run_config.shuffle: - executor_options["shuffle_method"] = run_config.shuffle - if run_config.broadcast_join_limit: - executor_options["broadcast_join_limit"] = run_config.broadcast_join_limit - if run_config.rapidsmpf_spill: - executor_options["rapidsmpf_spill"] = run_config.rapidsmpf_spill - if run_config.fallback_mode: - executor_options["fallback_mode"] = run_config.fallback_mode - if run_config.cluster == "distributed": - executor_options["cluster"] = "distributed" - executor_options["client_device_threshold"] = run_config.spill_device - executor_options["runtime"] = run_config.runtime - executor_options["max_io_threads"] = run_config.max_io_threads - executor_options["spill_to_pinned_memory"] = run_config.spill_to_pinned_memory - if not run_config.dynamic_planning: - # Disable dynamic planning - executor_options["dynamic_planning"] = None - - if ( - benchmark - and benchmark.__name__ == "PDSHQueries" - and run_config.executor == "streaming" - and not run_config.dynamic_planning - ): - executor_options["unique_fraction"] = { - "c_custkey": 0.05, - "l_orderkey": 1.0, - "l_partkey": 0.1, - 
"o_custkey": 0.25, - } - - return executor_options - - -def print_query_plan( - q_id: int, - q: pl.LazyFrame, - args: argparse.Namespace, - run_config: RunConfig, - engine: None | pl.GPUEngine = None, - *, - print_plans: bool = True, -) -> tuple[str | None, str | None]: - """Print the query plan.""" - logical_plan = plan = None - if run_config.executor == "cpu": - if args.explain_logical: - logical_plan = q.explain() - if args.explain: - plan = q.show_graph(engine="streaming", plan_stage="physical") - elif CUDF_POLARS_AVAILABLE: - assert isinstance(engine, pl.GPUEngine) - if args.explain_logical: - logical_plan = explain_query(q, engine, physical=False) - if args.explain and run_config.executor == "streaming": - plan = explain_query(q, engine) - else: - raise RuntimeError( - "Cannot provide the logical or physical plan because cudf_polars is not installed." - ) - - if print_plans: - if logical_plan: - print(f"\nQuery {q_id} - Logical plan\n") - print(logical_plan) - if plan: - print(f"\nQuery {q_id} - Physical plan\n") - print(plan) - - return logical_plan, plan - - -def initialize_dask_cluster(run_config: RunConfig, args: argparse.Namespace): # type: ignore[no-untyped-def] - """ - Initialize a Dask distributed cluster. - - This function either creates a new LocalCUDACluster or connects to an - existing Dask cluster depending on the provided arguments. - - Parameters - ---------- - run_config : RunConfig - The run configuration. - args : argparse.Namespace - Parsed command line arguments. If ``args.scheduler_address`` or - ``args.scheduler_file`` is provided, we connect to an existing - cluster instead of creating a LocalCUDACluster. - - Returns - ------- - Client or None - A Dask distributed Client, or None if not using distributed mode. 
- """ - if run_config.cluster != "distributed": - return None - - from distributed import Client - - # Check if we should connect to an existing cluster - scheduler_address = args.scheduler_address - scheduler_file = args.scheduler_file - - if scheduler_address is not None: - # Connect to existing cluster via scheduler address - client = Client(address=scheduler_address) - n_workers = client.scheduler_info()["n_workers"] - print( - f"Connected to existing Dask cluster at {scheduler_address} " - f"with {n_workers} workers" - ) - elif scheduler_file is not None: - # Connect to existing cluster via scheduler file - client = Client(scheduler_file=scheduler_file) - n_workers = client.scheduler_info()["n_workers"] - print( - f"Connected to existing Dask cluster via scheduler file: {scheduler_file} " - f"with {n_workers} workers" - ) - else: - # Create a new LocalCUDACluster - from dask_cuda import LocalCUDACluster - - kwargs = { - "n_workers": run_config.n_workers, - "dashboard_address": ":8585", - "protocol": args.protocol, - "rmm_pool_size": args.rmm_pool_size, - "rmm_async": args.rmm_async, - "rmm_release_threshold": args.rmm_release_threshold, - "threads_per_worker": run_config.threads, - "memory_limit": args.worker_memory_limit, - } - - client = Client(LocalCUDACluster(**kwargs)) - client.wait_for_workers(run_config.n_workers) - - if run_config.shuffle != "tasks": - try: - from rapidsmpf.config import Options, get_environment_variables - from rapidsmpf.integrations.dask import bootstrap_dask_cluster - - bootstrap_dask_cluster( - client, - options=Options( - { - "dask_spill_device": str(run_config.spill_device), - "dask_spill_to_pinned_memory": str( - run_config.spill_to_pinned_memory - ), - "dask_statistics": str(args.rapidsmpf_dask_statistics), - "dask_print_statistics": str(args.rapidsmpf_print_statistics), - "dask_oom_protection": str(args.rapidsmpf_oom_protection), - } - | get_environment_variables() - ), - ) - # Setting this globally makes the peak statistics 
not meaningful - # across queries / iterations. But doing it per query isn't worth - # the effort right now. - client.run(rmm.statistics.enable_statistics) - except ImportError as err: - if run_config.shuffle == "rapidsmpf": - raise ImportError( - "rapidsmpf is required for shuffle='rapidsmpf' but is not installed." - ) from err - - return client - - -def drop_file_page_cache_recursively(path: os.PathLike | str) -> None: - """Drop the Linux page cache for all files under `path`.""" - try: - import kvikio - except ImportError as err: - raise RuntimeError( - "kvikio is required for cold-run page cache dropping. " - "Install it or switch to --io-mode lukewarm." - ) from err - p = Path(path).expanduser() - if p.is_file(): - kvikio.drop_file_page_cache(p) - return - for f in p.rglob("*"): - if f.is_file(): - kvikio.drop_file_page_cache(f) - - -def execute_query( - q_id: int, - i: int, - q: pl.LazyFrame, - run_config: RunConfig, - args: argparse.Namespace, - engine: None | pl.GPUEngine = None, -) -> tuple[pl.DataFrame, float]: - """Execute a query with NVTX annotation.""" - if run_config.io_mode == "cold": - drop_file_page_cache_recursively(run_config.dataset_path) - - with nvtx.annotate( - message=f"Query {q_id} - Iteration {i}", - domain="cudf_polars", - color="green", - ): - if run_config.executor == "cpu": - t0 = time.monotonic() - result = q.collect(engine="streaming") - t1 = time.monotonic() - - elif CUDF_POLARS_AVAILABLE: - assert isinstance(engine, pl.GPUEngine) - if args.debug: - translator = Translator(q._ldf.visit(), engine) - ir = translator.translate_ir() - context = IRExecutionContext() - if run_config.executor == "in-memory": - t0 = time.monotonic() - result = ir.evaluate( - cache={}, timer=None, context=context - ).to_polars() - t1 = time.monotonic() - elif run_config.executor == "streaming": - t0 = time.monotonic() - result = evaluate_streaming( - ir, - translator.config_options, - ) - t1 = time.monotonic() - else: - assert_never(run_config.executor) - 
else: - t0 = time.monotonic() - result = q.collect(engine=engine) - t1 = time.monotonic() - - else: - raise RuntimeError("The requested engine is not supported.") - - return result, t1 - t0 - - -def _query_type(num_queries: int) -> Callable[[str | int], list[int]]: - def parse(query: str | int) -> list[int]: - if isinstance(query, int): - return [query] - if query == "all": - return list(range(1, num_queries + 1)) - - result: set[int] = set() - for part in query.split(","): - if "-" in part: - start, end = part.split("-") - result.update(range(int(start), int(end) + 1)) - else: - result.add(int(part)) - return sorted(result) - - return parse - - -def build_parser(num_queries: int = 22) -> argparse.ArgumentParser: - """Build the argument parser for PDS-H/PDS-DS benchmarks.""" - parser = argparse.ArgumentParser( - prog="Cudf-Polars PDS-H Benchmarks", - description="Experimental streaming-executor benchmarks.", - formatter_class=argparse.RawTextHelpFormatter, - ) - parser.add_argument( - "query", - type=_query_type(num_queries), - help=textwrap.dedent("""\ - Query to run. One of the following: - - A single number (e.g. 11) - - A comma-separated list of query numbers (e.g. 1,3,7) - - A range of query number (e.g. 1-11,23-34) - - The string 'all' to run all queries (1 through 22)"""), - ) - parser.add_argument( - "--path", - type=str, - default=os.environ.get("PDSH_DATASET_PATH"), - help=textwrap.dedent("""\ - Path to the root directory of the PDS-H dataset. 
- Defaults to the PDSH_DATASET_PATH environment variable."""), - ) - parser.add_argument( - "--root", - type=str, - default=os.environ.get("PDSH_DATASET_ROOT"), - help="Root PDS-H dataset directory (ignored if --path is used).", - ) - parser.add_argument( - "--scale", - type=str, - default=None, - help="Dataset scale factor.", - ) - parser.add_argument( - "--qualification", - action="store_true", - help="Use TPC-DS qualification parameters from specification Appendix B (PDS-DS only).", - ) - parser.add_argument( - "--suffix", - type=str, - default=".parquet", - help=textwrap.dedent("""\ - File suffix for input table files. - Default: .parquet"""), - ) - parser.add_argument( - "-e", - "--executor", - default="streaming", - type=str, - choices=["in-memory", "streaming", "cpu"], - help=textwrap.dedent("""\ - Query executor backend: - - in-memory : Evaluate query in GPU memory - - streaming : Partitioned evaluation (default) - - cpu : Use Polars CPU engine"""), - ) - parser.add_argument( - "-c", - "--cluster", - default=None, - type=str, - choices=["single", "distributed"], - help=textwrap.dedent("""\ - Cluster type to use with the 'streaming' executor. - - single : Run locally in a single process - - distributed : Use Dask for multi-GPU execution"""), - ) - parser.add_argument( - "--runtime", - type=str, - choices=["tasks", "rapidsmpf"], - default="tasks", - help="Runtime to use for the streaming executor (tasks or rapidsmpf).", - ) - parser.add_argument( - "--stream-policy", - type=str, - choices=["auto", "default", "new", "pool"], - default="auto", - help=textwrap.dedent("""\ - CUDA stream policy (auto, default, new, pool). 
- Default: auto (use the default policy for the runtime)"""), - ) - parser.add_argument( - "--n-workers", - default=1, - type=int, - help="Number of Dask-CUDA workers (requires 'distributed' cluster).", - ) - external_cluster_group = parser.add_mutually_exclusive_group() - external_cluster_group.add_argument( - "--scheduler-address", - default=None, - type=str, - help=textwrap.dedent("""\ - Scheduler address for connecting to an existing Dask cluster. - If provided, a cluster is not created and worker - configuration options (--n-workers, --rmm-pool-size, etc.) - are ignored since the workers are assumed to be started separately."""), - ) - external_cluster_group.add_argument( - "--scheduler-file", - default=None, - type=str, - help=textwrap.dedent("""\ - Path to a scheduler file for connecting to an existing Dask cluster. - If provided, a cluster is not created and worker - configuration options (--n-workers, --rmm-pool-size, etc.) - are ignored since the workers are assumed to be started separately."""), - ) - parser.add_argument( - "--blocksize", - default=None, - type=int, - help="Target partition size, in bytes, for IO tasks.", - ) - parser.add_argument( - "--max-rows-per-partition", - default=None, - type=int, - help="The maximum number of rows to process per partition.", - ) - parser.add_argument( - "--iterations", - default=1, - type=int, - help="Number of times to run the same query.", - ) - parser.add_argument( - "--io-mode", - dest="io_mode", - default="lukewarm", - choices=["cold", "lukewarm", "hot"], - help=textwrap.dedent("""\ - Cache state control for each timed iteration: - - cold : Drop Linux page cache before each iteration (requires kvikio) - - lukewarm : No cache manipulation; OS cache state unchanged (default) - - hot : One untimed warmup iteration to populate cache before measured runs"""), - ) - parser.add_argument( - "--debug", - default=False, - action="store_true", - help="Debug run.", - ) - parser.add_argument( - "--protocol", - 
default="ucx", - type=str, - choices=["ucx"], - help="Communication protocol to use for Dask: ucx (uses ucxx)", - ) - parser.add_argument( - "--shuffle", - default=None, - type=str, - choices=[None, "rapidsmpf", "tasks"], - help="Shuffle method to use for distributed execution.", - ) - parser.add_argument( - "--broadcast-join-limit", - default=None, - type=int, - help="Set an explicit `broadcast_join_limit` option.", - ) - parser.add_argument( - "--threads", - default=1, - type=int, - help="Number of threads to use on each GPU.", - ) - parser.add_argument( - "--rmm-pool-size", - default=None, - type=float, - help=textwrap.dedent("""\ - Fraction of total GPU memory to allocate for RMM pool. - Default: 0.5 (50%% of GPU memory) when --no-rmm-async, - None when --rmm-async"""), - ) - parser.add_argument( - "--rmm-release-threshold", - default=None, - type=float, - help=textwrap.dedent("""\ - Passed to dask_cuda.LocalCUDACluster or CudaAsyncMemoryResource - to control the release threshold for RMM pool memory. - Default: None (no release threshold)"""), - ) - parser.add_argument( - "--worker-memory-limit", - default="auto", - type=str, - help=textwrap.dedent("""\ - Passed to dask_cuda.LocalCUDACluster to control the memory limit - of each Dask worker. Use 'auto' to let Dask determine the limit - automatically, or '0' for unlimited. - Default: auto"""), - ) - parser.add_argument( - "--rmm-async", - action=argparse.BooleanOptionalAction, - default=False, - help="Use RMM async memory resource. Note: only affects distributed cluster!", - ) - parser.add_argument( - "--rapidsmpf-oom-protection", - action=argparse.BooleanOptionalAction, - default=False, - help="Use rapidsmpf CUDA managed memory-based OOM protection.", - ) - parser.add_argument( - "--rapidsmpf-dask-statistics", - action=argparse.BooleanOptionalAction, - default=False, - help="Collect rapidsmpf shuffle statistics. 
The output will be stored in the 'shuffle_stats' field of each record.", - ) - parser.add_argument( - "--rapidsmpf-print-statistics", - action=argparse.BooleanOptionalAction, - default=False, - help="Print rapidsmpf shuffle statistics on each Dask worker upon completion.", - ) - parser.add_argument( - "--rapidsmpf-spill", - action=argparse.BooleanOptionalAction, - default=False, - help="Use rapidsmpf for general spilling.", - ) - parser.add_argument( - "--spill-device", - default=0.5, - type=float, - help="Rapidsmpf device spill threshold.", - ) - parser.add_argument( - "-o", - "--output", - type=argparse.FileType("at"), - default="pdsh_results.jsonl", - help="Output file path.", - ) - parser.add_argument( - "--summarize", - action=argparse.BooleanOptionalAction, - help="Summarize the results.", - default=True, - ) - parser.add_argument( - "--print-results", - action=argparse.BooleanOptionalAction, - help="Print the query results", - default=True, - ) - parser.add_argument( - "--explain", - action=argparse.BooleanOptionalAction, - help="Print an outline of the physical plan", - default=False, - ) - parser.add_argument( - "--explain-logical", - action=argparse.BooleanOptionalAction, - help="Print an outline of the logical plan", - default=False, - ) - parser.add_argument( - "--print-plans", - action=argparse.BooleanOptionalAction, - help="Print the query plans", - default=True, - ) - parser.add_argument( - "--validate", - action=argparse.BooleanOptionalAction, - default=False, - help=( - "Validate the result against CPU execution. This will " - "run the query with both GPU and baseline engine (CPU polars or DuckDB), collect the " - "results in memory, and compare them using polars'. " - "At larger scale factors, computing the expected result can be slow so " - "--validate-directory should be used instead." 
- ), - ) - parser.add_argument( - "--baseline", - choices=["duckdb", "cpu"], - default="duckdb", - help="Which engine to use as the baseline for validation.", - ) - - parser.add_argument( - "--collect-traces", - action=argparse.BooleanOptionalAction, - default=False, - help="Collect data tracing cudf-polars execution.", - ) - - parser.add_argument( - "--dynamic-planning", - action=argparse.BooleanOptionalAction, - default=True, - help="Enable dynamic physical-plan generation. Only available for the 'rapidsmpf' runtime.", - ) - parser.add_argument( - "--max-io-threads", - default=2, - type=int, - help="Maximum number of IO threads for rapidsmpf runtime.", - ) - parser.add_argument( - "--native-parquet", - action=argparse.BooleanOptionalAction, - default=False, - help="Use C++ read_parquet nodes for the rapidsmpf runtime.", - ) - parser.add_argument( - "--results-directory", - type=Path, - default=None, - help="Optional directory to write query results as parquet files.", - ) - parser.add_argument( - "--output-expected-directory", - type=Path, - default=None, - help="Optional directory to write expected results as parquet files, when computed from CPU-polars or DuckDB.", - ) - parser.add_argument( - "--validate-directory", - type=Path, - default=None, - help=( - "Validate the results against a directory with a pre-computed set of 'golden' results. " - "The directory should contain one parquet file per query, named 'qDD.parquet', where DD is the " - "zero-padded query number. The JSON output will include the validation results for each record." - ), - ) - parser.add_argument( - "--validation-abs-tol", - type=float, - default=0.01, - help="Absolute tolerance for assert_frame_equal validation. 
Default: 0.01", - ) - parser.add_argument( - "--spill-to-pinned-memory", - action=argparse.BooleanOptionalAction, - default=True, - help=textwrap.dedent("""\ - Whether RapidsMPF should spill to pinned host memory when available, - or use regular pageable host memory."""), - ) - parser.add_argument( - "--extra-info", - type=json.loads, - default={}, - help="Extra information to add to the output file (e.g. version information). Must be JSON-serializable.", - ) - parser.add_argument( - "--fallback-mode", - type=str, - choices=["warn", "raise", "silent"], - default=None, - help=textwrap.dedent("""\ - How to handle operations that don't support multiple partitions in streaming executor. - - warn : Emit a warning and fall back to single partition (default) - - raise : Raise an exception - - silent : Silently fall back to single partition"""), - ) - parser.add_argument( - "--duckdb-threads", - type=int, - default=None, - help="Number of threads for DuckDB to use. Defaults to os.cpu_count().", - ) - parser.add_argument( - "--duckdb-memory-limit", - type=str, - default=None, - help="DuckDB memory limit (e.g. '500GB'). 
If unset, DuckDB uses its default.", - ) - parser.add_argument( - "--duckdb-temp-dir", - type=str, - default=None, - help="Directory for DuckDB to spill temporary data to disk.", - ) - - return parser - - -def parse_args( - args: Sequence[str] | None = None, - num_queries: int = 22, - parser: argparse.ArgumentParser | None = None, -) -> argparse.Namespace: - """Parse command line arguments.""" - if parser is None: - parser = build_parser(num_queries) - parsed_args = parser.parse_args(args) - - if parsed_args.rmm_pool_size is None and not parsed_args.rmm_async: - # The default rmm pool size depends on the rmm_async flag - parsed_args.rmm_pool_size = 0.5 - - if parsed_args.validate_directory and parsed_args.validate: - raise ValueError("Specify either --validate-directory or --validate, not both.") - if ( - parsed_args.validate_directory is not None - and not parsed_args.validate_directory.exists() - ): - raise FileNotFoundError( - f"--validate-directory: {parsed_args.validate_directory} does not exist." 
- ) - if parsed_args.validate_directory: - validation_files = list_validation_files(parsed_args.validate_directory) - missing_files = [ - str(x) for x in set(parsed_args.query) - set(validation_files.keys()) - ] - - if missing_files: - raise ValueError(f"Missing files for queries: {','.join(missing_files)}") - - if ( - parsed_args.output_expected_directory - and not parsed_args.validate - and parsed_args.engine != "duckdb" - ): - raise ValueError("Must specify --validate to use --output-expected-directory.") - - if ( - parsed_args.suffix - and not parsed_args.suffix.startswith(".") - and not parsed_args.suffix.startswith("/") - ): - parsed_args.suffix = f".{parsed_args.suffix}" - - return parsed_args - - -def list_validation_files( - validate_directory: Path, -) -> dict[int, Path]: - """List the validation files in the given directory.""" - validation_files: dict[int, Path] = {} - for q_path in validate_directory.glob("q*.parquet"): - q_id = int(q_path.stem.lstrip("q").lstrip("_")) - validation_files[q_id] = q_path - return validation_files - - -def validate_result( - result: pl.DataFrame, - expected: pl.DataFrame, - sort_by: list[tuple[str, bool]], - limit: int | None = None, - sort_keys: list[tuple[pl.Expr, bool]] | None = None, - **kwargs: Any, -) -> ValidationResult: - """ - Validate the computed result against the expected answer. - - This takes care of special handling for validating TPC-H queries, - where multiple results might be considered correct. - - See Also - -------- - cudf_polars.testing.asserts.assert_tpch_result_equal - """ - try: - assert_tpch_result_equal( - result, - expected, - sort_by=sort_by, - limit=limit, - sort_keys=sort_keys, - **kwargs, - ) - except Exception as e: - return ValidationResult.from_error(e) - else: - return ValidationResult(status="Passed", message=None) - - -@dataclasses.dataclass -class QueryResult: - """ - Representation of a query's result. - - Parameters - ---------- - frame: pl.LazyFrame - The result of the query. 
- sort_by: list[tuple[str, bool]] - The columns that the query sorts by. Each tuple contains (column_name, descending_flag). - Used for the ties/limit boundary logic in validation. - sort_keys: list[tuple[pl.Expr, bool]] | None - Optional Polars expressions for the sortedness check. Use this when the query - sorts by a conditional expression (e.g. ``CASE WHEN lochierarchy = 0 THEN i_category END``) - that cannot be represented as a plain column name in ``sort_by``. When provided, - these expressions are evaluated against the output and used only for the sortedness - check; ``sort_by`` still drives the ties/limit boundary logic. - limit: int | None - The limit of the query, if any. - - """ - - frame: pl.LazyFrame - sort_by: list[tuple[str, bool]] - limit: int | None = None - nulls_last: bool = True - sort_keys: list[tuple[pl.Expr, bool]] | None = None - - -def check_input_data_type( - run_config: RunConfig, -) -> tuple[Literal["decimal", "float"], Literal["date", "timestamp"]]: - """ - Check the input data types columns with variable data types. - - Our queries might be run on datasets that use different data types for different - types of columns. Our validation supports: - - 1. 'decimal' or 'float' for non-integer numeric columns (e.g. 'c_acctbal') - 2. 'date' or 'timestamp' for date type columns (e.g. 'o_orderdate') - - For PDS-H, this is determined by the ``c_acctbal`` column in the - customer table. For PDS-DS, we use ``i_current_price`` from the item table. 
- """ - if run_config.query_set == "pdsds": - table, col = "item", "i_current_price" - else: - table, col = "customer", "c_acctbal" - path = f"{run_config.dataset_path}/{table}{run_config.suffix}" - t = pl.scan_parquet(path).select(pl.col(col)).collect_schema()[col] - - num_type: Literal["decimal", "float"] - date_type: Literal["date", "timestamp"] - if t.is_decimal(): - num_type = "decimal" - else: - num_type = "float" - - if run_config.query_set == "pdsds": - date_type = "date" - else: - path = f"{run_config.dataset_path}/orders{run_config.suffix}" - t = ( - pl.scan_parquet(path) - .select(pl.col("o_orderdate")) - .collect_schema()["o_orderdate"] - ) - - if t.to_python().__name__ == "date": - date_type = "date" - else: - date_type = "timestamp" - - return num_type, date_type - - -def run_polars_query_iteration( - q_id: int, - iteration: int, - q: pl.LazyFrame, - run_config: RunConfig, - args: argparse.Namespace, - engine: pl.GPUEngine | None, - expected: pl.DataFrame | None, - query_result: Any, - client: Any, - prepare_validation_result: Callable[[pl.DataFrame], pl.DataFrame] | None = None, - result_casts: list[pl.Expr] | None = None, -) -> SuccessRecord: - """Run a single query iteration. 
Caller must wrap in try/except.""" - result, duration = execute_query(q_id, iteration, q, run_config, args, engine) - - if expected is not None and prepare_validation_result is not None: - result = prepare_validation_result(result) - - if expected is not None and result_casts: - # Applying the casts to the polars result is - # a workaround we need because of a polars bug - # See https://github.com/pola-rs/polars/issues/27269 - # Once we support polars 1.40, we should remove this - result = result.with_columns(*result_casts) - - if run_config.shuffle == "rapidsmpf" and run_config.gather_shuffle_stats: - from rapidsmpf.integrations.dask.shuffler import ( - clear_shuffle_statistics, - gather_shuffle_statistics, - ) - - shuffle_stats = gather_shuffle_statistics(client) - clear_shuffle_statistics(client) - else: - shuffle_stats = None - - if expected is not None: - validation_result = validate_result( - result, - expected, - query_result.sort_by, - limit=query_result.limit, - nulls_last=query_result.nulls_last, - sort_keys=query_result.sort_keys, - **get_validation_options(args), - ) - else: - validation_result = None - - if args.print_results: - print(result) - - if args.results_directory is not None and iteration == 0: - results_dir = Path(args.results_directory) - results_dir.mkdir(parents=True, exist_ok=True) - output_path = results_dir / f"q_{q_id:02d}.parquet" - result.write_parquet(output_path) - - return SuccessRecord( - query=q_id, - iteration=iteration, - duration=duration, - shuffle_stats=shuffle_stats, - validation_result=validation_result, - ) - - -def run_polars_query( - q_id: int, - benchmark: Any, - run_config: RunConfig, - args: argparse.Namespace, - engine: pl.GPUEngine | None, - client: Any, - numeric_type: str, - date_type: str, - validation_files: dict[int, Path] | None, - prepare_validation_result: Callable[[pl.DataFrame], pl.DataFrame] | None = None, -) -> QueryRunResult: - """Run all iterations for a single query. 
Caller must wrap in try/except.""" - query_result = getattr(benchmark, f"q{q_id}")(run_config) - q = query_result.frame - - print_query_plan(q_id, q, args, run_config, engine, print_plans=args.print_plans) - plan = None - if (args.explain or args.explain_logical) and engine is not None: - from cudf_polars.experimental.explain import serialize_query - - plan = serialize_query(q, engine) - - casts = benchmark.EXPECTED_CASTS.get(q_id, []) - if numeric_type == "decimal": - casts.extend(benchmark.EXPECTED_CASTS_DECIMAL.get(q_id, [])) - if date_type == "timestamp": - casts.extend(benchmark.EXPECTED_CASTS_TIMESTAMP.get(q_id, [])) - - expected: pl.DataFrame | None = None - if args.validate: - if args.baseline == "cpu": - expected = q.collect() - elif args.baseline == "duckdb": - duckdb_queries_cls = benchmark().duckdb_queries - get_ddb = getattr(duckdb_queries_cls, f"q{q_id}") - base_sql = get_ddb(run_config) - expected = execute_duckdb_query( - base_sql, - run_config.dataset_path, - query_set=duckdb_queries_cls.name, - suffix=run_config.suffix, - run_config=run_config, - ).with_columns(*casts) - else: - raise ValueError(f"Invalid baseline: {args.baseline}") - elif validation_files is not None: - expected = pl.read_parquet(validation_files[q_id]).with_columns(*casts) - else: - expected = None - - if args.output_expected_directory is not None: - assert expected is not None, ( - "Expected result must be computed before writing to disk." 
- ) - expected_dir = Path(args.output_expected_directory) - expected_dir.mkdir(parents=True, exist_ok=True) - expected.write_parquet(expected_dir / f"q_{q_id:02d}.parquet") - - query_records: list[SuccessRecord | FailedRecord] = [] - iteration_failures: list[tuple[int, int]] = [] - validation_failed = False - record: SuccessRecord | FailedRecord - - for i in range(args.iterations): - if _HAS_STRUCTLOG and run_config.collect_traces: - setup_logging(q_id, i) - if client is not None: - client.run(setup_logging, q_id, i) - - try: - record = run_polars_query_iteration( - q_id=q_id, - iteration=i, - q=q, - run_config=run_config, - args=args, - engine=engine, - expected=expected, - query_result=query_result, - client=client, - prepare_validation_result=prepare_validation_result, - result_casts=casts if casts else None, - ) - except Exception: - print(f"❌ query={q_id} iteration={i} failed!") - print(traceback.format_exc()) - iteration_failures.append((q_id, i)) - record = FailedRecord( - query=q_id, - iteration=i, - status="error", - traceback=traceback.format_exc(), - ) - - else: - if record.validation_result and record.validation_result.status == "Failed": - validation_failed = True - print( - f"❌ Query {q_id} failed validation!\n{record.validation_result.message}" - ) - if record.validation_result.details: - pprint.pprint(record.validation_result.details) - else: - prefix = "✅ " if record.validation_result else "" - print( - f"{prefix}Query {q_id} - Iteration {i} finished in {record.duration:0.4f}s", - flush=True, - ) - - query_records.append(record) - - return QueryRunResult( - query_records=query_records, - plan=plan, - iteration_failures=iteration_failures, - validation_failed=validation_failed, - ) - - -def _run_query_loop( - benchmark: Any, - args: argparse.Namespace, - run_config: RunConfig, - engine: pl.GPUEngine | None, - client: Any, - numeric_type: str, - date_type: str, - validation_files: dict[int, Path] | None, - prepare_validation_result: 
Callable[[pl.DataFrame], pl.DataFrame] | None = None, -) -> tuple[ - defaultdict[int, list[SuccessRecord | FailedRecord]], - dict[int, Any], - list[int], - list[tuple[int, int]], -]: - """Execute all queries in ``run_config`` and return accumulated results.""" - records: defaultdict[int, list[SuccessRecord | FailedRecord]] = defaultdict(list) - plans: dict[int, SerializablePlan] = {} - validation_failures: list[int] = [] - query_failures: list[tuple[int, int]] = [] - - for q_id in run_config.queries: - try: - result = run_polars_query( - q_id=q_id, - benchmark=benchmark, - run_config=run_config, - args=args, - engine=engine, - client=client, - numeric_type=numeric_type, - date_type=date_type, - validation_files=validation_files, - prepare_validation_result=prepare_validation_result, - ) - except Exception: - print(f"❌ query={q_id} failed (setup or execution)!") - print(traceback.format_exc()) - query_failures.append((q_id, -1)) - record = FailedRecord( - query=q_id, - iteration=-1, - traceback=traceback.format_exc(), - ) - result = QueryRunResult( - query_records=[record], - plan=None, - iteration_failures=[], - validation_failed=False, - ) - - records[q_id] = result.query_records - if result.plan is not None: - plans[q_id] = result.plan - query_failures.extend(result.iteration_failures) - if result.validation_failed: - validation_failures.append(q_id) - - return records, plans, validation_failures, query_failures - - -def _consolidate_logs(run_config: RunConfig, client: Any) -> RunConfig: - """Merge structlog traces from the local process and Dask workers into run_config.""" - if not (_HAS_STRUCTLOG and run_config.collect_traces): - return run_config - - def gather_logs() -> str: - logger = logging.getLogger() - return logger.handlers[0].stream.getvalue() # type: ignore[attr-defined] - - if client is not None: - # Gather logs from both client (for Query Plan) and workers - worker_logs = "\n".join(client.run(gather_logs).values()) - client_logs = gather_logs() - 
all_logs = client_logs + "\n" + worker_logs - else: - all_logs = gather_logs() - - parsed_logs = [json.loads(log) for log in all_logs.splitlines() if log] - # Some other log records can end up in here. Filter those out. - scope_values = {s.value for s in Scope} - parsed_logs = [log for log in parsed_logs if log.get("scope") in scope_values] - # Now we want to augment the existing Records with the trace data. - - def group_key(x: dict) -> int: - return x["query_id"] - - def sort_key(x: dict) -> tuple[int, int]: - return x["query_id"], x["iteration"] - - grouped = itertools.groupby( - sorted(parsed_logs, key=sort_key), - key=group_key, - ) - - for query_id, run_logs_group in grouped: - run_logs = list(run_logs_group) - by_iteration = [ - list(x) - for _, x in itertools.groupby(run_logs, key=lambda x: x["iteration"]) - ] - run_records = run_config.records[query_id] - assert len(by_iteration) == len(run_records) # same number of iterations - all_traces = [list(iteration) for iteration in by_iteration] - - new_records: list[SuccessRecord | FailedRecord] = [] - for rec, traces in zip(run_records, all_traces, strict=True): - if rec.status == "success": - new_records.append(dataclasses.replace(rec, traces=traces)) - else: - new_records.append(rec) - - run_config.records[query_id] = new_records - - return run_config - - -def run_polars( - benchmark: Any, - args: argparse.Namespace, -) -> None: - """Run the queries using the given benchmark and executor options.""" - vars(args).update({"query_set": benchmark.name}) - run_config = RunConfig.from_args(args) - numeric_type, date_type = check_input_data_type(run_config) - validation_files = ( - list_validation_files(args.validate_directory) - if args.validate_directory is not None - else None - ) - parquet_options = ( - {"use_rapidsmpf_native": run_config.native_parquet} - if run_config.runtime == "rapidsmpf" - else {} - ) - match run_config.cluster: - case "single" | "distributed" | None: - run_polars_single_or_dask( - 
benchmark, - args, - run_config, - parquet_options, - numeric_type, - date_type, - validation_files, - ) - - -def run_polars_single_or_dask( - benchmark: Any, - args: argparse.Namespace, - run_config: RunConfig, - parquet_options: dict[str, Any], - numeric_type: str, - date_type: str, - validation_files: dict[int, Path] | None, -) -> None: - """Run benchmark queries using Dask or single-process execution.""" - client = initialize_dask_cluster(run_config, args) - if client is not None: - run_config = dataclasses.replace( - run_config, n_workers=client.scheduler_info()["n_workers"] - ) - - engine = None - if run_config.executor != "cpu": - executor_options = get_executor_options(run_config, benchmark=benchmark) - engine = pl.GPUEngine( - raise_on_fail=True, - memory_resource=rmm.mr.CudaAsyncMemoryResource( - release_threshold=args.rmm_release_threshold - ) - if run_config.rmm_async - else None, - cuda_stream_policy=run_config.stream_policy, - executor=run_config.executor, - executor_options=executor_options, - parquet_options=parquet_options, - ) - - records, plans, validation_failures, query_failures = _run_query_loop( - benchmark, - args, - run_config, - engine, - client, - numeric_type, - date_type, - validation_files, - ) - run_config = dataclasses.replace(run_config, records=dict(records), plans=plans) - run_config = _consolidate_logs(run_config, client=client) - if client is not None: - client.close(timeout=60) - - if args.summarize: - run_config.summarize() - - if args.validate and run_config.executor != "cpu": - print("\nValidation Summary") - print("==================") - if validation_failures: - print( - f"{len(validation_failures)} queries failed validation: {sorted(set(validation_failures))}" - ) - else: - print("✅ All validated queries passed.") - - args.output.write(json.dumps(run_config.serialize(engine=engine))) - args.output.write("\n") - - sys.exit(1 if (query_failures or validation_failures) else 0) - - -def setup_logging(query_id: int, iteration: 
int) -> None: # noqa: D103 - import cudf_polars.dsl.tracing - - if not cudf_polars.dsl.tracing.LOG_TRACES: - msg = ( - "Tracing requested via --collect-traces, but tracking is not enabled. " - "Verify that 'CUDF_POLARS_LOG_TRACES' is set and structlog is installed." - ) - raise RuntimeError(msg) - - if _HAS_STRUCTLOG: - # structlog uses contextvars to propagate context down to where log records - # are emitted. Ideally, we'd just set the contextvars here using - # structlog.bind_contextvars; for the distributed cluster we would need - # to use something like client.run to set the contextvars on the worker. - # However, there's an unfortunate conflict between structlog's use of - # context vars and how Dask Workers actually execute tasks, such that - # the contextvars set via `client.run` aren't visible to the actual - # tasks. - # - # So instead we make a new logger each time we need a new context, - # i.e. for each query/iteration pair. - - def make_injector( - query_id: int, iteration: int - ) -> Callable[[logging.Logger, str, dict[str, Any]], dict[str, Any]]: - def inject( - logger: Any, method_name: Any, event_dict: Any - ) -> dict[str, Any]: - event_dict["query_id"] = query_id - event_dict["iteration"] = iteration - return event_dict - - return inject - - shared_processors = [ - structlog.contextvars.merge_contextvars, - make_injector(query_id, iteration), - structlog.processors.add_log_level, - structlog.processors.CallsiteParameterAdder( - parameters=[ - structlog.processors.CallsiteParameter.PROCESS, - structlog.processors.CallsiteParameter.THREAD, - ], - ), - structlog.processors.StackInfoRenderer(), - structlog.dev.set_exc_info, - structlog.processors.TimeStamper(fmt="%Y-%m-%d %H:%M:%S.%f", utc=False), - ] - - # For logging to a file - json_renderer = structlog.processors.JSONRenderer() - - stream = io.StringIO() - json_file_handler = logging.StreamHandler(stream) - json_file_handler.setFormatter( - structlog.stdlib.ProcessorFormatter( - 
processor=json_renderer, - foreign_pre_chain=shared_processors, - ) - ) - - logging.basicConfig(level=logging.INFO, handlers=[json_file_handler]) - - structlog.configure( - processors=[ - *shared_processors, - structlog.stdlib.ProcessorFormatter.wrap_for_formatter, - ], - logger_factory=structlog.stdlib.LoggerFactory(), - wrapper_class=structlog.make_filtering_bound_logger(logging.INFO), - cache_logger_on_first_use=True, - ) - - -PDSDS_TABLE_NAMES: list[str] = [ - "call_center", - "catalog_page", - "catalog_returns", - "catalog_sales", - "customer", - "customer_address", - "customer_demographics", - "date_dim", - "household_demographics", - "income_band", - "inventory", - "item", - "promotion", - "reason", - "ship_mode", - "store", - "store_returns", - "store_sales", - "time_dim", - "warehouse", - "web_page", - "web_returns", - "web_sales", - "web_site", -] - -PDSH_TABLE_NAMES: list[str] = [ - "customer", - "lineitem", - "nation", - "orders", - "part", - "partsupp", - "region", - "supplier", -] - - -def _make_duckdb_config(run_config: RunConfig | None) -> dict[str, Any]: - """Build a DuckDB connection config dict from a RunConfig.""" - config: dict[str, Any] = { - "threads": run_config.duckdb_threads - if (run_config and run_config.duckdb_threads is not None) - else os.cpu_count(), - } - if run_config and run_config.duckdb_memory_limit is not None: - config["memory_limit"] = run_config.duckdb_memory_limit - if run_config and run_config.duckdb_temp_dir is not None: - config["temp_directory"] = run_config.duckdb_temp_dir - return config - - -def print_duckdb_plan( - q_id: int, - sql: str, - dataset_path: Path, - suffix: str, - query_set: str, - args: argparse.Namespace, - run_config: RunConfig | None = None, -) -> None: - """Print DuckDB query plan using EXPLAIN.""" - if duckdb is None: - raise ImportError(duckdb_err) - - if query_set == "pdsds": - tbl_names = PDSDS_TABLE_NAMES - else: - tbl_names = PDSH_TABLE_NAMES - - with 
duckdb.connect(config=_make_duckdb_config(run_config)) as conn: - for name in tbl_names: - pattern = f"{dataset_path}/{name}{suffix}" - conn.execute( - f"CREATE OR REPLACE VIEW {name} AS " - f"SELECT * FROM parquet_scan('{pattern}');" - ) - - if args.explain_logical and args.explain: - conn.execute("PRAGMA explain_output = 'all';") - elif args.explain_logical: - conn.execute("PRAGMA explain_output = 'optimized_only';") - else: - conn.execute("PRAGMA explain_output = 'physical_only';") - - print(f"\nDuckDB Query {q_id} - Plan\n") - - plan_rows = conn.execute(f"EXPLAIN {sql}").fetchall() - for _, line in plan_rows: - print(line) - - -def execute_duckdb_query( - query: str, - dataset_path: Path, - *, - suffix: str = ".parquet", - query_set: str = "pdsh", - run_config: RunConfig | None = None, -) -> pl.DataFrame: - """Execute a query with DuckDB.""" - if duckdb is None: - raise ImportError(duckdb_err) - if query_set == "pdsds": - tbl_names = PDSDS_TABLE_NAMES - else: - tbl_names = PDSH_TABLE_NAMES - with duckdb.connect(config=_make_duckdb_config(run_config)) as conn: - for name in tbl_names: - pattern = f"{dataset_path}/{name}{suffix}" - conn.execute( - f"CREATE OR REPLACE VIEW {name} AS " - f"SELECT * FROM parquet_scan('{pattern}');" - ) - return conn.execute(query).pl() - - -def run_duckdb(duckdb_queries_cls: Any, args: argparse.Namespace) -> None: - """Run the benchmark with DuckDB.""" - vars(args).update({"query_set": duckdb_queries_cls.name}) - run_config = RunConfig.from_args(args) - records: defaultdict[int, list[SuccessRecord | FailedRecord]] = defaultdict(list) - - for q_id in run_config.queries: - try: - get_q = getattr(duckdb_queries_cls, f"q{q_id}") - except AttributeError as err: - raise NotImplementedError(f"Query {q_id} not implemented.") from err - - sql = get_q(run_config) - - if args.explain or args.explain_logical: - print_duckdb_plan( - q_id=q_id, - sql=sql, - dataset_path=run_config.dataset_path, - suffix=run_config.suffix, - 
query_set=duckdb_queries_cls.name, - args=args, - run_config=run_config, - ) - - print(f"DuckDB Executing: {q_id}") - records[q_id] = [] - - for i in range(args.iterations): - if run_config.io_mode == "cold": - drop_file_page_cache_recursively(run_config.dataset_path) - t0 = time.time() - result = execute_duckdb_query( - sql, - run_config.dataset_path, - suffix=run_config.suffix, - query_set=duckdb_queries_cls.name, - run_config=run_config, - ) - t1 = time.time() - record = SuccessRecord(query=q_id, iteration=i, duration=t1 - t0) - if args.print_results: - print(result) - print(f"Query {q_id} - Iteration {i} finished in {record.duration:0.4f}s") - records[q_id].append(record) - if i == 0 and args.output_expected_directory is not None: - expected_dir = Path(args.output_expected_directory) - expected_dir.mkdir(parents=True, exist_ok=True) - result.write_parquet(expected_dir / f"q_{q_id:02d}.parquet") - - run_config = dataclasses.replace(run_config, records=dict(records)) - if args.summarize: - run_config.summarize() - - args.output.write(json.dumps(run_config.serialize(engine=None))) - args.output.write("\n") diff --git a/python/cudf_polars/cudf_polars/experimental/dask_registers.py b/python/cudf_polars/cudf_polars/experimental/dask_registers.py deleted file mode 100644 index 94334ccbc57..00000000000 --- a/python/cudf_polars/cudf_polars/experimental/dask_registers.py +++ /dev/null @@ -1,227 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 - -"""Dask function registrations such as serializers and dispatch implementations.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, ClassVar, overload - -from dask.sizeof import sizeof as sizeof_dispatch -from dask.tokenize import normalize_token -from distributed.protocol import dask_deserialize, dask_serialize -from distributed.protocol.cuda import cuda_deserialize, cuda_serialize -from distributed.utils import log_errors - -import pylibcudf as plc -import rmm - -from cudf_polars.containers import Column, DataFrame, DataType -from cudf_polars.dsl.expressions.base import NamedExpr -from cudf_polars.utils.cuda_stream import get_dask_cuda_stream - -if TYPE_CHECKING: - from collections.abc import Hashable, Mapping - - from distributed import Client - - from rmm.pylibrmm.memory_resource import DeviceMemoryResource - from rmm.pylibrmm.stream import Stream - - from cudf_polars.typing import ColumnHeader, ColumnOptions, DataFrameHeader - - -__all__ = ["DaskRegisterManager", "register"] - - -class DaskRegisterManager: # pragma: no cover; Only used with Distributed cluster - """Manager to ensure ensure serializer is only registered once.""" - - _registered: bool = False - _client_run_executed: ClassVar[set[str]] = set() - - @classmethod - def register_once(cls) -> None: - """Register Dask/cudf-polars serializers in calling process.""" - if not cls._registered: - from cudf_polars.experimental.dask_registers import register - - register() - cls._registered = True - - @classmethod - def run_on_cluster(cls, client: Client) -> None: - """Run register on the workers and scheduler once.""" - if client.id not in cls._client_run_executed: - client.run(cls.register_once) - client.run_on_scheduler(cls.register_once) - cls._client_run_executed.add(client.id) - - -def register() -> None: - """Register dask serialization and dispatch functions.""" - - @overload - def serialize_column_or_frame( - x: DataFrame, - ) 
-> tuple[DataFrameHeader, list[memoryview]]: ... - - @overload - def serialize_column_or_frame( - x: Column, - ) -> tuple[ColumnHeader, list[memoryview]]: ... - - @cuda_serialize.register((Column, DataFrame)) - def serialize_column_or_frame( - x: DataFrame | Column, - ) -> tuple[ - DataFrameHeader | ColumnHeader, list[memoryview[bytes] | plc.gpumemoryview] - ]: - with log_errors(): - header, frames = x.serialize(stream=get_dask_cuda_stream()) - # Dask expect a list of frames - return header, list(frames) - - @cuda_deserialize.register(DataFrame) - def _( - header: DataFrameHeader, frames: tuple[memoryview[bytes], plc.gpumemoryview] - ) -> DataFrame: - with log_errors(): - metadata, gpudata = frames # TODO: check if this is a length-2 list... - return DataFrame.deserialize( - header, - (metadata, plc.gpumemoryview(gpudata)), - stream=get_dask_cuda_stream(), - ) - - @cuda_deserialize.register(Column) - def _( - header: ColumnHeader, frames: tuple[memoryview[bytes], plc.gpumemoryview] - ) -> Column: - with log_errors(): - metadata, gpudata = frames - return Column.deserialize( - header, - (metadata, plc.gpumemoryview(gpudata)), - stream=get_dask_cuda_stream(), - ) - - @overload - def dask_serialize_column_or_frame( - x: DataFrame, - ) -> tuple[DataFrameHeader, tuple[memoryview[bytes], memoryview[bytes]]]: ... - - @overload - def dask_serialize_column_or_frame( - x: Column, - ) -> tuple[ColumnHeader, tuple[memoryview[bytes], memoryview[bytes]]]: ... 
- - @dask_serialize.register(Column) - def dask_serialize_column_or_frame( - x: DataFrame | Column, - ) -> tuple[ - DataFrameHeader | ColumnHeader, tuple[memoryview[bytes], memoryview[bytes]] - ]: - stream = get_dask_cuda_stream() - with log_errors(): - header, (metadata, gpudata) = x.serialize(stream=stream) - - # For robustness, we check that the gpu data is contiguous - cai = gpudata.__cuda_array_interface__ - assert len(cai["shape"]) == 1 - assert cai["strides"] is None or cai["strides"] == (1,) - assert cai["typestr"] == "|u1" - nbytes = cai["shape"][0] - - # Copy the gpudata to host memory - gpudata_on_host: memoryview[bytes] = memoryview( - rmm.DeviceBuffer(ptr=gpudata.ptr, size=nbytes).copy_to_host() - ) - return header, (metadata, gpudata_on_host) - - @dask_deserialize.register(Column) - def _(header: ColumnHeader, frames: tuple[memoryview[bytes], memoryview]) -> Column: - with log_errors(): - assert len(frames) == 2 - # Copy the second frame (the gpudata in host memory) back to the gpu - new_frames = ( - frames[0], - plc.gpumemoryview(rmm.DeviceBuffer.to_device(frames[1])), - ) - return Column.deserialize(header, new_frames, stream=get_dask_cuda_stream()) - - @dask_serialize.register(DataFrame) - def _( - x: DataFrame, context: Mapping[str, Any] | None = None - ) -> tuple[DataFrameHeader, tuple[memoryview[bytes], memoryview[bytes]]]: - # Do regular serialization if no staging buffer is provided. - if context is None or "staging_device_buffer" not in context: - return dask_serialize_column_or_frame(x) - - # If a staging buffer is provided, we use `ChunkedPack` to - # serialize the dataframe using the provided staging buffer. - with log_errors(): - # Keyword arguments for `Column.__init__`. 
- columns_kwargs: list[ColumnOptions] = [ - col.serialize_ctor_kwargs() for col in x.columns - ] - header: DataFrameHeader = { - "columns_kwargs": columns_kwargs, - "frame_count": 2, - } - if "stream" not in context: - raise ValueError( - "context: stream must be given when staging_device_buffer is" - ) - if "device_mr" not in context: - raise ValueError( - "context: device_mr must be given when staging_device_buffer is" - ) - stream: Stream = context["stream"] - device_mr: DeviceMemoryResource = context["device_mr"] - buf: rmm.DeviceBuffer = context["staging_device_buffer"] - frame = plc.contiguous_split.ChunkedPack.create( - x.table, buf.nbytes, stream, device_mr - ).pack_to_host(buf) - return header, frame - - @dask_deserialize.register(DataFrame) - def _( - header: DataFrameHeader, frames: tuple[memoryview[bytes], memoryview] - ) -> DataFrame: - with log_errors(): - assert len(frames) == 2 - # Copy the second frame (the gpudata in host memory) back to the gpu - new_frames = ( - frames[0], - plc.gpumemoryview(rmm.DeviceBuffer.to_device(frames[1])), - ) - return DataFrame.deserialize( - header, new_frames, stream=get_dask_cuda_stream() - ) - - @sizeof_dispatch.register(Column) - def _(x: Column) -> int: - """The total size of the device buffers used by the DataFrame or Column.""" - return x.obj.device_buffer_size() - - @sizeof_dispatch.register(DataFrame) - def _(x: DataFrame) -> int: - """The total size of the device buffers used by the DataFrame or Column.""" - return sum(c.obj.device_buffer_size() for c in x.columns) - - # Register rapidsmpf serializer if it's installed. - try: - from rapidsmpf.integrations.dask.spilling import register_dask_serialize - - register_dask_serialize() # pragma: no cover; rapidsmpf dependency not included yet - except ImportError: # pragma: no cover - pass - - # Register the tokenizer for NamedExpr and DataType. 
This is a performance - # optimization that speeds up tokenization for the most common types seen in - # the Dask task graph. - @normalize_token.register(NamedExpr) - @normalize_token.register(DataType) - def _(x: NamedExpr | DataType) -> Hashable: - return hash(x) diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py index 804bcfe2040..f45baa054dd 100644 --- a/python/cudf_polars/cudf_polars/experimental/io.py +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -410,17 +410,17 @@ def _( if ( Path(ir.path).exists() and executor_options.sink_to_directory - and executor_options.cluster in (Cluster.SINGLE, Cluster.DISTRIBUTED) + and executor_options.cluster == Cluster.SINGLE ): - # This lowering-time check can't be performed with the new spmd / ray / dask + # This lowering-time check can't be performed with the spmd / ray / dask # clusters, which lower on each worker independently. There's a race condition # between each worker performing this check that the path doesn't yet exist, # and the sink operation creating the directory at the start of execution. raise NotImplementedError( - f"Trying to sink to an existing directory: {ir.path}." + f"Trying to sink to an existing directory: {ir.path}. " "Writing to an existing path is not supported when sinking " - "to a directory. If you are using the 'distributed' scheduler, " - "please remove the target directory before calling 'collect'. " + "to a directory. Please remove the target directory before " + "calling 'collect'." 
) sink_to_directory = executor_options.sink_to_directory diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index e9104b5e074..108d7822d60 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -109,7 +109,6 @@ def lower_ir_graph( def task_graph( ir: IR, partition_info: MutableMapping[IR, PartitionInfo], - config_options: ConfigOptions, ) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]: """ Construct a task graph for evaluation of an IR graph. @@ -121,16 +120,13 @@ def task_graph( partition_info A mapping from all unique IR nodes to the associated partitioning information. - config_options - GPUEngine configuration options. - context - Runtime context for IR node execution. Returns ------- graph - A Dask-compatible task graph for the entire - IR graph with root `ir`. + A task graph for the entire IR graph with root `ir`, + in dict-of-tuples form consumed by + :func:`~cudf_polars.experimental.scheduler.synchronous_scheduler`. Notes ----- @@ -138,9 +134,6 @@ def task_graph( graph with root `ir`, and extracts the tasks for each node with :func:`generate_ir_tasks`. - The output is passed into :func:`post_process_task_graph` to - add any additional processing that is specific to the executor. - See Also -------- generate_ir_tasks @@ -167,67 +160,9 @@ def task_graph( else: key = (key_name, 0) - graph = post_process_task_graph(graph, key, config_options) return graph, key -# The true type signature for get_scheduler() needs an overload. Not worth it. 
- - -def get_scheduler(config_options: ConfigOptions[StreamingExecutor]) -> Any: - """Get appropriate task scheduler.""" - cluster = config_options.executor.cluster - - if ( - cluster == "distributed" - ): # pragma: no cover; block depends on executor type and Distributed cluster - from distributed import get_client - - from cudf_polars.experimental.dask_registers import DaskRegisterManager - - client = get_client() - DaskRegisterManager.register_once() - DaskRegisterManager.run_on_cluster(client) - return client.get - elif cluster == "single": - from cudf_polars.experimental.scheduler import synchronous_scheduler - - return synchronous_scheduler - else: # pragma: no cover - raise ValueError(f"{cluster} not a supported cluster option.") - - -def post_process_task_graph( - graph: MutableMapping[Any, Any], - key: str | tuple[str, int], - config_options: ConfigOptions[StreamingExecutor], -) -> MutableMapping[Any, Any]: - """ - Post-process the task graph. - - Parameters - ---------- - graph - Task graph to post-process. - key - Output key for the graph. - config_options - GPUEngine configuration options. - - Returns - ------- - graph - A Dask-compatible task graph. - """ - if config_options.executor.rapidsmpf_spill: # pragma: no cover - from cudf_polars.experimental.spilling import wrap_dataframe_in_spillable - - return wrap_dataframe_in_spillable( - graph, ignore_key=key, config_options=config_options - ) - return graph - - def evaluate_rapidsmpf( ir: IR, config_options: ConfigOptions[StreamingExecutor], @@ -280,12 +215,14 @@ def evaluate_streaming( return evaluate_rapidsmpf(ir, config_options) else: # Using the default task engine. 
+ from cudf_polars.experimental.scheduler import synchronous_scheduler + stats = collect_statistics(ir, config_options) ir, partition_info = lower_ir_graph(ir, config_options, stats) - graph, key = task_graph(ir, partition_info, config_options) + graph, key = task_graph(ir, partition_info) - return get_scheduler(config_options)(graph, key).to_polars() + return synchronous_scheduler(graph, key).to_polars() @generate_ir_tasks.register(IR) diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py index 57b916a6d9f..478c0a33beb 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py @@ -45,7 +45,6 @@ Union, ) from cudf_polars.dsl.traversal import CachingVisitor, traversal -from cudf_polars.experimental.base import PartitionInfo from cudf_polars.experimental.parallel import lower_ir_graph from cudf_polars.experimental.rapidsmpf.collectives import ReserveOpIDs from cudf_polars.experimental.rapidsmpf.dispatch import FanoutInfo @@ -55,7 +54,6 @@ ) from cudf_polars.experimental.rapidsmpf.tracing import log_query_plan from cudf_polars.experimental.rapidsmpf.utils import empty_table_chunk -from cudf_polars.experimental.repartition import Repartition from cudf_polars.experimental.statistics import collect_statistics from cudf_polars.utils.config import CUDAStreamPoolConfig @@ -70,7 +68,7 @@ import polars as pl from cudf_polars.dsl.ir import IR - from cudf_polars.experimental.base import StatsCollector + from cudf_polars.experimental.base import PartitionInfo, StatsCollector from cudf_polars.experimental.parallel import ConfigOptions from cudf_polars.experimental.rapidsmpf.dispatch import ( GenState, @@ -109,33 +107,6 @@ def evaluate_logical_plan( cudf_polars_query_id=str(query_id), ): match config_options.executor.cluster: - case "distributed": # pragma: no cover; block depends on executor type and Distributed 
cluster - # Legacy distributed execution: lower on the client, - # ship the lowered plan to workers. - from cudf_polars.experimental.rapidsmpf.dask import ( - evaluate_pipeline_dask, - ) - - stats = collect_statistics(ir, config_options) - ir, partition_info = lower_ir_graph(ir, config_options, stats) - - # Dask may return chunks in arbitrary order. - if not isinstance(ir, Repartition): - ir = Repartition(ir.schema, ir) - partition_info[ir] = PartitionInfo(count=1) - - with ReserveOpIDs(ir, config_options) as collective_id_map: - log_query_plan(ir, config_options) - result, metadata_collector = evaluate_pipeline_dask( - evaluate_pipeline, - ir, - partition_info, - config_options, - stats, - collective_id_map, - collect_metadata=collect_metadata, - query_id=query_id, - ) case "spmd": from cudf_polars.experimental.rapidsmpf.frontend.spmd import ( evaluate_pipeline_spmd_mode, diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/dask.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/dask.py deleted file mode 100644 index f13c488d5ea..00000000000 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/dask.py +++ /dev/null @@ -1,194 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 -"""Dask-based execution with the streaming RapidsMPF runtime.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any, Protocol - -from distributed import get_client -from rapidsmpf.config import Options, get_environment_variables -from rapidsmpf.integrations.dask import bootstrap_dask_cluster, get_worker_context -from rapidsmpf.streaming.core.context import Context - -import polars as pl - -import cudf_polars.dsl.tracing -from cudf_polars.experimental.dask_registers import DaskRegisterManager - -if TYPE_CHECKING: - import uuid - from collections.abc import MutableMapping - - from distributed import Client - from rapidsmpf.communicator.communicator import Communicator - from rapidsmpf.streaming.cudf.channel_metadata import ChannelMetadata - - from cudf_polars.dsl.ir import IR - from cudf_polars.experimental.base import PartitionInfo, StatsCollector - from cudf_polars.experimental.parallel import ConfigOptions, StreamingExecutor - - -class EvaluatePipelineCallback(Protocol): - """Protocol for the evaluate_pipeline callback.""" - - def __call__( - self, - ir: IR, - partition_info: MutableMapping[IR, PartitionInfo], - config_options: ConfigOptions[StreamingExecutor], - stats: StatsCollector, - collective_id_map: dict[IR, list[int]], - comm: Communicator, - rmpf_context: Context | None = None, - *, - collect_metadata: bool = False, - query_id: uuid.UUID, - ) -> tuple[pl.DataFrame, list[ChannelMetadata] | None]: - """Evaluate a pipeline and return the result DataFrame and metadata.""" - ... 
- - -def get_dask_client() -> Client: - """Get a distributed Dask client.""" - client = get_client() - DaskRegisterManager.register_once() - DaskRegisterManager.run_on_cluster(client) - return client - - -def evaluate_pipeline_dask( - callback: EvaluatePipelineCallback, - ir: IR, - partition_info: MutableMapping[IR, PartitionInfo], - config_options: ConfigOptions, - stats: StatsCollector, - collective_id_map: dict[IR, list[int]], - *, - collect_metadata: bool = False, - query_id: uuid.UUID, -) -> tuple[pl.DataFrame, list[ChannelMetadata] | None]: - """ - Evaluate a RapidsMPF streaming pipeline on a Dask cluster. - - Parameters - ---------- - callback - The callback function to evaluate the pipeline. - ir - The IR node. - partition_info - The partition information. - config_options - The configuration options. - stats - The statistics collector. - collective_id_map - Mapping from Shuffle/Repartition/Join IR nodes to reserved collective IDs. - collect_metadata - Whether to collect metadata. - query_id - A unique identifier for the query. - - Returns - ------- - The output DataFrame and metadata collector. - """ - client = get_dask_client() - - # Make sure the cluster is bootstrapped. - # This is a no-op if the cluster is already bootstrapped. - # TODO: We can apply configuration options here. However, these - # options will be ignored if the cluster is already bootstrapped. 
- bootstrap_dask_cluster(client) - - result = client.run( - _evaluate_pipeline_dask, - callback, - ir, - partition_info, - config_options, - stats, - collective_id_map, - collect_metadata=collect_metadata, - query_id=query_id, - ) - dfs: list[pl.DataFrame] = [] - metadata_collector: list[ChannelMetadata] = [] - for df, md in result.values(): - dfs.append(df) - if md is not None: - metadata_collector.extend(md) - - return pl.concat(dfs), metadata_collector or None - - -def _evaluate_pipeline_dask( - callback: EvaluatePipelineCallback, - ir: IR, - partition_info: MutableMapping[IR, PartitionInfo], - config_options: ConfigOptions[StreamingExecutor], - stats: StatsCollector, - collective_id_map: dict[IR, list[int]], - dask_worker: Any = None, - *, - collect_metadata: bool = False, - query_id: uuid.UUID, -) -> tuple[pl.DataFrame, list[ChannelMetadata] | None]: - """ - Build and evaluate a RapidsMPF streaming pipeline. - - Parameters - ---------- - callback - The callback function to evaluate the pipeline. - ir - The IR node. - partition_info - The partition information. - config_options - The configuration options. - stats - The statistics collector. - collective_id_map - Mapping from Shuffle/Repartition/Join IR nodes to reserved collective IDs. - dask_worker - Dask worker reference. - This kwarg is automatically populated by Dask - when evaluate_pipeline is called with `client.run`. - collect_metadata - Whether to collect metadata. - query_id - A unique identifier for the query. - - Returns - ------- - The output DataFrame and metadata collector. 
- """ - assert dask_worker is not None, "Dask worker must be provided" - - # NOTE: The Dask-CUDA cluster must be bootstrapped - # ahead of time using bootstrap_dask_cluster - # (rapidsmpf.integrations.dask.bootstrap_dask_cluster) - options = Options( - {"num_streaming_threads": str(max(config_options.executor.max_io_threads, 1))} - | get_environment_variables() - ) - dask_context = get_worker_context(dask_worker) - assert dask_context.comm is not None - with ( - Context(dask_context.comm.logger, dask_context.br, options) as rmpf_context, - cudf_polars.dsl.tracing.bound_contextvars(query_id=str(query_id)), - ): - # IDs are already reserved by the caller, just pass them through - return callback( - ir, - partition_info, - config_options, - stats, - collective_id_map, - dask_context.comm, - rmpf_context, - collect_metadata=collect_metadata, - query_id=query_id, - ) diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/io.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/io.py index 985567621d6..d5005441910 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/io.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/io.py @@ -185,7 +185,7 @@ async def dataframescan_node( distributed_scan If ``True``, the DataFrame is treated as a shared object and divided across workers so each rank reads a disjoint subset. This is normally - used in ``Cluster.DISTRIBUTED`` mode. + used in ``Cluster.RAY`` and ``Cluster.DASK`` modes. If ``False``, the DataFrame is treated as rank-local and each rank scans its local DataFrame in full. 
This is normally used in diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py index efb134abf7c..8e24dd83fe6 100644 --- a/python/cudf_polars/cudf_polars/experimental/shuffle.py +++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py @@ -6,7 +6,7 @@ import operator from functools import partial -from typing import TYPE_CHECKING, Any, Concatenate, Literal, TypeVar, TypedDict +from typing import TYPE_CHECKING, Any, Concatenate, TypeVar, TypedDict import pylibcudf as plc from rmm.pylibrmm.stream import DEFAULT_STREAM @@ -42,7 +42,6 @@ class ShuffleOptions(TypedDict): on: Sequence[str] column_names: Sequence[str] dtypes: Sequence[DataType] - cluster_kind: Literal["dask", "single"] # Experimental rapidsmpf shuffler integration @@ -61,12 +60,7 @@ def insert_partition( ) -> None: """Add cudf-polars DataFrame chunks to an RMP shuffler.""" from rapidsmpf.integrations.cudf.partition import partition_and_pack - - if options["cluster_kind"] == "dask": - from rapidsmpf.integrations.dask import get_worker_context - - else: - from rapidsmpf.integrations.single import get_worker_context + from rapidsmpf.integrations.single import get_worker_context context = get_worker_context() @@ -95,12 +89,7 @@ def extract_partition( unpack_and_concat, unspill_partitions, ) - - if options["cluster_kind"] == "dask": - from rapidsmpf.integrations.dask import get_worker_context - - else: - from rapidsmpf.integrations.single import get_worker_context + from rapidsmpf.integrations.single import get_worker_context context = get_worker_context() @@ -329,45 +318,27 @@ def _( shuffle_method = ir.shuffle_method # Try using rapidsmpf shuffler if we have "simple" shuffle - # keys, and the "shuffle_method" config is set to "rapidsmpf" + # keys, and the "shuffle_method" config is set to "rapidsmpf-single". 
_keys: list[Col] - if shuffle_method in ("rapidsmpf", "rapidsmpf-single") and len( + if shuffle_method == "rapidsmpf-single" and len( _keys := [ne.value for ne in ir.keys if isinstance(ne.value, Col)] ) == len(ir.keys): # pragma: no cover - cluster_kind: Literal["dask", "single"] - if shuffle_method == "rapidsmpf-single": - from rapidsmpf.integrations.single import rapidsmpf_shuffle_graph - - cluster_kind = "single" - else: - from rapidsmpf.integrations.dask import rapidsmpf_shuffle_graph - - cluster_kind = "dask" + from rapidsmpf.integrations.single import rapidsmpf_shuffle_graph shuffle_on = [k.name for k in _keys] - try: - return rapidsmpf_shuffle_graph( - get_key_name(ir.children[0]), - get_key_name(ir), - partition_info[ir.children[0]].count, - partition_info[ir].count, - RMPFIntegration, - { - "on": shuffle_on, - "column_names": list(ir.schema.keys()), - "dtypes": list(ir.schema.values()), - "cluster_kind": cluster_kind, - }, - ) - except ValueError as err: - # ValueError: rapidsmpf couldn't find a distributed client - if shuffle_method == "rapidsmpf": - # Only raise an error if the user specifically - # set the shuffle method to "rapidsmpf" - raise ValueError( - "The current Dask cluster does not support rapidsmpf shuffling." 
- ) from err + return rapidsmpf_shuffle_graph( + get_key_name(ir.children[0]), + get_key_name(ir), + partition_info[ir.children[0]].count, + partition_info[ir].count, + RMPFIntegration, + { + "on": shuffle_on, + "column_names": list(ir.schema.keys()), + "dtypes": list(ir.schema.values()), + }, + ) # Simple task-based fall-back return partial(_simple_shuffle_graph, context=context)( diff --git a/python/cudf_polars/cudf_polars/experimental/sort.py b/python/cudf_polars/cudf_polars/experimental/sort.py index 47105add263..6800fb4ab74 100644 --- a/python/cudf_polars/cudf_polars/experimental/sort.py +++ b/python/cudf_polars/cudf_polars/experimental/sort.py @@ -5,7 +5,7 @@ from __future__ import annotations from functools import partial -from typing import TYPE_CHECKING, Any, Literal, TypedDict +from typing import TYPE_CHECKING, Any, TypedDict import polars as pl @@ -295,7 +295,6 @@ class SortedShuffleOptions(TypedDict): null_order: Sequence[plc.types.NullOrder] column_names: Sequence[str] column_dtypes: Sequence[DataType] - cluster_kind: Literal["dask", "single"] # Experimental rapidsmpf shuffler integration @@ -313,12 +312,7 @@ def insert_partition( ) -> None: """Add cudf-polars DataFrame chunks to an RMP shuffler.""" from rapidsmpf.integrations.cudf.partition import split_and_pack - - if options["cluster_kind"] == "dask": - from rapidsmpf.integrations.dask import get_worker_context - - else: - from rapidsmpf.integrations.single import get_worker_context + from rapidsmpf.integrations.single import get_worker_context context = get_worker_context() @@ -360,12 +354,7 @@ def extract_partition( unpack_and_concat, unspill_partitions, ) - - if options["cluster_kind"] == "dask": - from rapidsmpf.integrations.dask import get_worker_context - - else: - from rapidsmpf.integrations.single import get_worker_context + from rapidsmpf.integrations.single import get_worker_context context = get_worker_context() @@ -646,41 +635,23 @@ def _( } # Try using rapidsmpf shuffler if we have 
"simple" shuffle - # keys, and the "shuffle_method" config is set to "rapidsmpf" + # keys, and the "shuffle_method" config is set to "rapidsmpf-single". shuffle_method = ir.shuffle_method - if shuffle_method in ("rapidsmpf", "rapidsmpf-single"): # pragma: no cover - try: - if shuffle_method == "rapidsmpf-single": - from rapidsmpf.integrations.single import rapidsmpf_shuffle_graph - - options["cluster_kind"] = "single" - else: - from rapidsmpf.integrations.dask import rapidsmpf_shuffle_graph - - options["cluster_kind"] = "dask" - graph.update( - rapidsmpf_shuffle_graph( - get_key_name(child), - get_key_name(ir), - partition_info[child].count, - partition_info[ir].count, - RMPFIntegrationSortedShuffle, - options, - sort_boundaries_name, - ) + if shuffle_method == "rapidsmpf-single": # pragma: no cover + from rapidsmpf.integrations.single import rapidsmpf_shuffle_graph + + graph.update( + rapidsmpf_shuffle_graph( + get_key_name(child), + get_key_name(ir), + partition_info[child].count, + partition_info[ir].count, + RMPFIntegrationSortedShuffle, + options, + sort_boundaries_name, ) - except (ImportError, ValueError) as err: - # ImportError: rapidsmpf is not installed - # ValueError: rapidsmpf couldn't find a distributed client - if shuffle_method == "rapidsmpf": # pragma: no cover - # Only raise an error if the user specifically - # set the shuffle method to "rapidsmpf" - raise ValueError( - "Rapidsmpf is not installed correctly or the current " - "Dask cluster does not support rapidsmpf shuffling." - ) from err - else: - return graph + ) + return graph # Simple task-based fall-back graph.update( diff --git a/python/cudf_polars/cudf_polars/experimental/spilling.py b/python/cudf_polars/cudf_polars/experimental/spilling.py deleted file mode 100644 index 6a5f73a68ed..00000000000 --- a/python/cudf_polars/cudf_polars/experimental/spilling.py +++ /dev/null @@ -1,148 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 -"""Spilling in multi-partition Dask execution using RAPIDSMPF.""" - -from __future__ import annotations - -from typing import TYPE_CHECKING, Any - -from dask.sizeof import sizeof -from distributed import get_worker -from rapidsmpf.integrations.dask.core import get_worker_context -from rapidsmpf.integrations.dask.spilling import SpillableWrapper -from rapidsmpf.memory.buffer import MemoryType - -from cudf_polars.containers import DataFrame - -if TYPE_CHECKING: - from collections.abc import Callable, MutableMapping - from typing import Any - - from cudf_polars.utils.config import ConfigOptions, StreamingExecutor - - -def wrap_arg(obj: Any) -> Any: - """ - Make `obj` spillable if it is a DataFrame. - - Parameters - ---------- - obj - The object to be wrapped (if it is a DataFrame). - - Returns - ------- - A SpillableWrapper if obj is a DataFrame, otherwise the original object. - """ - if isinstance(obj, DataFrame): - return SpillableWrapper(on_device=obj) - return obj - - -def unwrap_arg(obj: Any) -> Any: - """ - Unwraps a SpillableWrapper to retrieve the original object. - - Parameters - ---------- - obj - The object to be unwrapped. - - Returns - ------- - The unwrapped obj is a SpillableWrapper, otherwise the original object. - """ - if isinstance(obj, SpillableWrapper): - return obj.unspill() - return obj - - -def wrap_func_spillable( - func: Callable, - *, - make_func_output_spillable: bool, - target_partition_size: int, -) -> Callable: - """ - Wraps a function to handle spillable DataFrames. - - Parameters - ---------- - func - The function to be wrapped. - make_func_output_spillable - Whether to wrap the function's output in a SpillableWrapper. - target_partition_size - Target byte size for IO tasks. - - Returns - ------- - A wrapped function that processes spillable DataFrames. 
- """ - - def wrapper(*args: Any) -> Any: - # Make headroom before executing the task - headroom = 0 - probable_io_task = True - for arg in args: - if isinstance(arg, SpillableWrapper): - if arg.mem_type() == MemoryType.HOST: - headroom += sizeof(arg._on_host) - probable_io_task = False - if probable_io_task: - # Likely an IO task - Assume we need target_partition_size - headroom = target_partition_size - if headroom > 128_000_000: # Don't waste time on smaller data - ctx = get_worker_context(get_worker()) - with ctx.lock: - ctx.br.spill_manager.spill_to_make_headroom(headroom=headroom) - - ret: Any = func(*(unwrap_arg(arg) for arg in args)) - if make_func_output_spillable: - ret = wrap_arg(ret) - return ret - - return wrapper - - -def wrap_dataframe_in_spillable( - graph: MutableMapping[Any, Any], - ignore_key: str | tuple[str, int], - config_options: ConfigOptions[StreamingExecutor], -) -> MutableMapping[Any, Any]: - """ - Wraps functions within a task graph to handle spillable DataFrames. - - Only supports flat task graphs where each DataFrame can be found in the - outermost level. Currently, this is true for all cudf-polars task graphs. - - Parameters - ---------- - graph - Task graph. - ignore_key - The key to ignore when wrapping function, typically the key of the - output node. - config_options - GPUEngine configuration options. - - Returns - ------- - A new task graph with wrapped functions. 
- """ - target_partition_size = config_options.executor.target_partition_size - - ret = {} - for key, task in graph.items(): - assert isinstance(task, tuple) - ret[key] = tuple( - wrap_func_spillable( - a, - make_func_output_spillable=key != ignore_key, - target_partition_size=target_partition_size, - ) - if callable(a) - else a - for a in task - ) - return ret diff --git a/python/cudf_polars/cudf_polars/utils/config.py b/python/cudf_polars/cudf_polars/utils/config.py index 84418e2dd06..a6bbd73929b 100644 --- a/python/cudf_polars/cudf_polars/utils/config.py +++ b/python/cudf_polars/cudf_polars/utils/config.py @@ -121,15 +121,6 @@ def rapidsmpf_single_available() -> bool: # pragma: no cover return False -@functools.cache -def rapidsmpf_distributed_available() -> bool: # pragma: no cover - """Query whether rapidsmpf is available as a distributed shuffle method.""" - try: - return importlib.util.find_spec("rapidsmpf.integrations.dask") is not None - except (ImportError, ValueError): - return False - - class StreamingFallbackMode(enum.StrEnum): """ How the streaming executor handles operations that don't support multiple partitions. @@ -165,15 +156,17 @@ class Cluster(enum.StrEnum): """ The cluster configuration for the streaming executor. - * ``Cluster.SINGLE`` : Single-GPU execution. Currently uses a zero-dependency, + * ``Cluster.SINGLE`` : Single-GPU execution. Uses a zero-dependency, synchronous, single-threaded task scheduler. - * ``Cluster.DISTRIBUTED`` : Multi-GPU distributed execution. Currently - uses a Dask-based distributed scheduler and requires an - active Dask cluster. + * ``Cluster.SPMD`` : Multi-GPU SPMD execution via the rapidsmpf streaming + runtime. + * ``Cluster.RAY`` : Multi-GPU execution via Ray actors and the rapidsmpf + streaming runtime. + * ``Cluster.DASK`` : Multi-GPU execution via Dask workers and the rapidsmpf + streaming runtime. 
""" SINGLE = "single" - DISTRIBUTED = "distributed" SPMD = "spmd" RAY = "ray" DASK = "dask" @@ -188,8 +181,7 @@ class ShuffleMethod(enum.StrEnum): * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler. With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None`` - will attempt to use ``ShuffleMethod.RAPIDSMPF`` for a distributed cluster, - but will fall back to ``ShuffleMethod.TASKS`` if rapidsmpf is not installed. + resolves to ``ShuffleMethod.TASKS``. The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly. A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process @@ -357,16 +349,14 @@ def default_broadcast_join_limit(cluster: str, runtime: str) -> int: # default_target_partition_size is used to set the # target partition size (i.e. 5x the 2.5% default). return min(5, int(max(1, (device_size * 0.125) // 1e9))) - elif ( - cluster == "single" - and _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 1 - ): - # We can lean on UVM to support most broadcast joins. + elif _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 1: + # The "tasks" runtime always runs single-GPU; we can lean on UVM + # to support most broadcast joins. return 32 else: - # Extra-conservative default for the "tasks" runtime. - # We cannot spill outside a rapidsmpf shuffle within - # this runtime. So, shuffling is usually preferred. + # Extra-conservative default for the "tasks" runtime without UVM. + # We cannot spill outside a rapidsmpf shuffle within this runtime, + # so shuffling is usually preferred. return 2 @@ -616,11 +606,10 @@ class StreamingExecutor: The cluster configuration for the streaming executor. ``Cluster.SINGLE`` by default. 
- This setting applies to both task-based and rapidsmpf execution modes: - * ``Cluster.SINGLE``: Single-GPU execution - * ``Cluster.DISTRIBUTED``: Multi-GPU distributed execution (requires - an active Dask cluster) + * ``Cluster.SPMD``: Multi-GPU SPMD execution (rapidsmpf runtime) + * ``Cluster.RAY``: Multi-GPU Ray execution (rapidsmpf runtime) + * ``Cluster.DASK``: Multi-GPU Dask execution (rapidsmpf runtime) fallback_mode How to handle errors when the GPU engine fails to execute a query. @@ -653,7 +642,7 @@ class StreamingExecutor: By default, cudf-polars uses a target partition size that's a fraction of the device memory, where the fraction depends on the cluster and runtime: - - distributed cluster or rapidsmpf runtime: 1/40th of the device memory + - rapidsmpf runtime: 1/40th of the device memory - single cluster and tasks runtime: 1/16th of the device memory The pynvml library is used to query the total device memory on the first @@ -677,20 +666,16 @@ class StreamingExecutor: on the cluster and runtime. shuffle_method The method to use for shuffling data between workers. Defaults to - 'rapidsmpf' for distributed cluster if available (otherwise 'tasks'), - and 'tasks' for single-GPU cluster. - rapidsmpf_spill - Whether to wrap task arguments and output in objects that are - spillable by 'rapidsmpf'. + 'tasks' for the single-GPU cluster. client_device_threshold Threshold for spilling data from device memory in rapidsmpf. Default is 50% of device memory on the client process. This argument is only used by the "rapidsmpf" runtime. sink_to_directory Whether multi-partition sink operations write to a directory rather - than a single file. For the distributed, spmd, ray, and dask clusters - this is always True; setting it to False raises a ValueError. - Defaults to False for the single-GPU cluster. + than a single file. For the spmd, ray, and dask clusters this is + always True; setting it to False raises a ValueError. Defaults to + False for the single-GPU cluster. 
dynamic_planning Options controlling dynamic shuffle planning. See :class:`~cudf_polars.utils.config.DynamicPlanningOptions` for more. @@ -709,9 +694,7 @@ class StreamingExecutor: Notes ----- The streaming executor does not currently support profiling a query via - the ``.profile()`` method. We recommend using nsys to profile queries - with single-GPU execution and Dask's built-in profiling tools - with distributed execution. + the ``.profile()`` method. We recommend using nsys to profile queries. """ _env_prefix = "CUDF_POLARS__EXECUTOR" @@ -770,11 +753,6 @@ class StreamingExecutor: default=ShuffleMethod.TASKS, ) ) - rapidsmpf_spill: bool = dataclasses.field( - default_factory=_make_default_factory( - f"{_env_prefix}__RAPIDSMPF_SPILL", _bool_converter, default=False - ) - ) client_device_threshold: float = dataclasses.field( default_factory=_make_default_factory( f"{_env_prefix}__CLIENT_DEVICE_THRESHOLD", float, default=0.5 @@ -820,23 +798,14 @@ def __post_init__(self) -> None: # noqa: D105 # Handle shuffle_method defaults for streaming executor if self.shuffle_method is None: - if self.cluster == "distributed" and rapidsmpf_distributed_available(): - # For distributed cluster, prefer rapidsmpf if available - object.__setattr__(self, "shuffle_method", "rapidsmpf") - else: - # Otherwise, use task-based shuffle for now. - # TODO: Evaluate single-process shuffle by default. - object.__setattr__(self, "shuffle_method", "tasks") + # Use task-based shuffle by default. + # TODO: Evaluate single-process shuffle by default. + object.__setattr__(self, "shuffle_method", "tasks") elif self.shuffle_method == "rapidsmpf-single": # The user should NOT specify "rapidsmpf-single" directly. 
raise ValueError("rapidsmpf-single is not a supported shuffle method.") elif self.shuffle_method == "rapidsmpf": - # Check that we have rapidsmpf installed - if self.cluster == "distributed" and not rapidsmpf_distributed_available(): - raise ValueError( - "rapidsmpf shuffle method requested, but rapidsmpf.integrations.dask is not installed." - ) - elif self.cluster == "single" and not rapidsmpf_single_available(): + if self.cluster == "single" and not rapidsmpf_single_available(): raise ValueError( "rapidsmpf shuffle method requested, but rapidsmpf is not installed." ) @@ -872,7 +841,7 @@ def __post_init__(self) -> None: # noqa: D105 DynamicPlanningOptions(**self.dynamic_planning), ) - if self.cluster in ("distributed", "spmd", "ray", "dask"): + if self.cluster in ("spmd", "ray", "dask"): if self.sink_to_directory is False: raise ValueError( f"The {self.cluster} cluster requires sink_to_directory=True" @@ -892,8 +861,6 @@ def __post_init__(self) -> None: # noqa: D105 raise TypeError("groupby_n_ary must be an int") if not isinstance(self.broadcast_join_limit, int): raise TypeError("broadcast_join_limit must be an int") - if not isinstance(self.rapidsmpf_spill, bool): - raise TypeError("rapidsmpf_spill must be bool") if not isinstance(self.sink_to_directory, bool): raise TypeError("sink_to_directory must be bool") if not isinstance(self.client_device_threshold, float): @@ -905,14 +872,6 @@ def __post_init__(self) -> None: # noqa: D105 if not isinstance(self.num_py_executors, int): raise TypeError("num_py_executors must be an int") - # RapidsMPF spill is only supported for distributed clusters for now. - # This is because the spilling API is still within the RMPF-Dask integration. - # (See https://github.com/rapidsai/rapidsmpf/issues/439) - if self.cluster == "single" and self.rapidsmpf_spill: # pragma: no cover - raise ValueError( - "rapidsmpf_spill is not supported for single-GPU execution." 
- ) - def __hash__(self) -> int: # noqa: D105 # cardinality factory, a dict, isn't natively hashable. We'll dump it # to json and hash that. diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py index e62d4ce6f86..7ad45c06605 100644 --- a/python/cudf_polars/tests/conftest.py +++ b/python/cudf_polars/tests/conftest.py @@ -236,7 +236,7 @@ def pytest_addoption(parser): "--cluster", action="store", default="single", - choices=("single", "distributed"), + choices=("single",), help="Cluster to use for 'streaming' executor.", ) @@ -262,12 +262,6 @@ def pytest_configure(config): # apply globally rather than per-module. config.addinivalue_line("filterwarnings", "ignore::ResourceWarning") - if ( - config.getoption("--cluster") == "distributed" - and config.getoption("--executor") != "streaming" - ): - raise pytest.UsageError("Distributed cluster requires --executor='streaming'") - if config.getoption("--runtime") == "rapidsmpf": if config.getoption("--executor") == "in-memory": raise pytest.UsageError("Rapidsmpf runtime requires --executor='streaming'") diff --git a/python/cudf_polars/tests/experimental/legacy/__init__.py b/python/cudf_polars/tests/experimental/legacy/__init__.py deleted file mode 100644 index 0b14ab351bc..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - -"""Legacy testing.""" - -from __future__ import annotations - -__all__: list[str] = [] diff --git a/python/cudf_polars/tests/experimental/legacy/conftest.py b/python/cudf_polars/tests/experimental/legacy/conftest.py deleted file mode 100644 index 6139eaef0af..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/conftest.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 -from __future__ import annotations - -import os - -import pytest - - -# scope="session" is important to not cause singificant slowdowns in CI -# https://github.com/rapidsai/cudf/pull/20137 -@pytest.fixture(autouse=True, scope="session") -def dask_cluster(pytestconfig, worker_id): - if ( - pytestconfig.getoption("--cluster") == "distributed" - and pytestconfig.getoption("--executor") == "streaming" - ): - worker_count = int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "0")) - from dask import config - from dask_cuda import LocalCUDACluster - - # Avoid "Sending large graph of size ..." warnings - # (We expect these for tests using literal/random arrays) - config.set({"distributed.admin.large-graph-warning-threshold": "20MB"}) - if worker_count > 0: - # Avoid port conflicts with multiple test runners - worker_index = int(worker_id.removeprefix("gw")) - scheduler_port = 8800 + worker_index - dashboard_address = 8900 + worker_index - else: - scheduler_port = None - dashboard_address = None - - n_workers = int(os.environ.get("CUDF_POLARS_NUM_WORKERS", "1")) - - with ( - LocalCUDACluster( - n_workers=n_workers, - scheduler_port=scheduler_port, - dashboard_address=dashboard_address, - ) as cluster, - cluster.get_client(), - ): - yield - else: - yield diff --git a/python/cudf_polars/tests/experimental/legacy/test_distributed.py b/python/cudf_polars/tests/experimental/legacy/test_distributed.py deleted file mode 100644 index dd8aedeca1f..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/test_distributed.py +++ /dev/null @@ -1,67 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import pytest - -import polars as pl - -from cudf_polars.testing.asserts import DEFAULT_RUNTIME -from cudf_polars.testing.io import make_lazy_frame - - -@pytest.mark.parametrize("source_format", ["frame", "parquet", "csv"]) -def test_simple_query_with_distributed_support(tmp_path, source_format) -> None: - # Test a trivial query that works for both the - # "tasks" and "rapidsmpf" runtimes in distributed mode. - - # Check that we have a distributed cluster running. - # This tests must be run with: --cluster='distributed' - distributed = pytest.importorskip("distributed") - try: - client = distributed.get_client() - except ValueError: - pytest.skip(reason="Requires distributed execution.") - - # check that we have a rapidsmpf cluster running - pytest.importorskip("rapidsmpf") - try: - from rapidsmpf.integrations.dask import bootstrap_dask_cluster - - bootstrap_dask_cluster(client) - except ValueError: - pytest.skip(reason="Requires a rapidsmpf-bootstrapped cluster.") - - # Setup the GPUEngine config - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 2, - "cluster": "distributed", - "runtime": DEFAULT_RUNTIME, - }, - ) - - # Create a simple DataFrame - df = pl.DataFrame( - { - "a": [1, 2, 3, 4, 5], - "b": [10, 20, 30, 40, 50], - } - ) - - # Create LazyFrame based on source format - if source_format == "frame": - lf = make_lazy_frame(df, fmt="frame") - else: - lf = make_lazy_frame(df, fmt=source_format, path=tmp_path, n_files=2) - - # Simple query: select and filter - q = lf.select("a", "b").filter(pl.col("a") > 2) - result = q.collect(engine=engine) - - # Check the result is correct - expected = df.lazy().select("a", "b").filter(pl.col("a") > 2).collect() - assert result.equals(expected) diff --git a/python/cudf_polars/tests/experimental/legacy/test_explain.py b/python/cudf_polars/tests/experimental/legacy/test_explain.py 
deleted file mode 100644 index 9d74d8d800a..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/test_explain.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import pytest - -import polars as pl - -from cudf_polars.experimental.explain import explain_query, serialize_query -from cudf_polars.testing.asserts import DEFAULT_CLUSTER, DEFAULT_RUNTIME -from cudf_polars.testing.io import make_partitioned_source - - -@pytest.fixture(scope="module") -def df(): - return pl.DataFrame( - { - "x": range(25_000), - "y": ["cat", "dog"] * 12_500, - "z": [1.0, 2.0] * 12_500, - } - ) - - -def test_explain_physical_plan(tmp_path, df): - make_partitioned_source(df, tmp_path, fmt="parquet", n_files=5) - - q = ( - pl.scan_parquet(tmp_path) - .filter((pl.col("x") < 40_000) & (pl.col("z") > 1.0)) - .with_columns((pl.col("x") + pl.col("z")).alias("sum")) - .select(["sum", "y"]) - ) - - engine = pl.GPUEngine( - executor="streaming", - raise_on_fail=True, - executor_options={ - "target_partition_size": 10_000, - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - }, - ) - - plan = explain_query(q, engine) - - if DEFAULT_RUNTIME == "tasks": - # rapidsmpf runtime does not split Scan nodes at lowering time - assert "UNION" in plan - assert "SPLITSCAN" in plan - assert "SELECT ('sum', 'y')" in plan or "PROJECTION ('sum', 'y')" in plan - - -def test_shuffle_properties(): - # Join with broadcast_join_limit=1 forces shuffle-based join, producing - # Shuffle nodes in the lowered plan. 
- left = pl.LazyFrame({"a": ["x", "y", "x"], "b": [1, 2, 3]}) - right = pl.LazyFrame({"a": ["x", "y", "z"], "c": [4, 5, 6]}) - q = left.join(right, on="a", how="inner") - engine = pl.GPUEngine( - executor="streaming", - raise_on_fail=True, - executor_options={ - "max_rows_per_partition": 1, - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - "shuffle_method": DEFAULT_RUNTIME, - "broadcast_join_limit": 1, - "dynamic_planning": None, # Requires static planning - }, - ) - dag = serialize_query(q, engine) - - shuffle_nodes = [n for n in dag.nodes.values() if n.type == "Shuffle"] - assert len(shuffle_nodes) >= 1, "Expected at least one Shuffle node in lowered plan" - node = shuffle_nodes[0] - - if DEFAULT_RUNTIME == "tasks": - shuffle_method = "tasks" - elif DEFAULT_CLUSTER == "single": - shuffle_method = "rapidsmpf-single" - else: - shuffle_method = "rapidsmpf" - - assert node.properties == { - "keys": ["a"], - "shuffle_method": shuffle_method, - } diff --git a/python/cudf_polars/tests/experimental/legacy/test_parallel.py b/python/cudf_polars/tests/experimental/legacy/test_parallel.py deleted file mode 100644 index b097262958a..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/test_parallel.py +++ /dev/null @@ -1,127 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import pickle - -import pytest - -import polars as pl -from polars.testing import assert_frame_equal - -from cudf_polars import Translator -from cudf_polars.experimental.parallel import ( - get_scheduler, - lower_ir_graph, - task_graph, -) -from cudf_polars.experimental.statistics import collect_statistics -from cudf_polars.testing.asserts import ( - DEFAULT_CLUSTER, - DEFAULT_RUNTIME, -) -from cudf_polars.utils.config import ConfigOptions - - -@pytest.fixture(scope="module") -def engine(): - return pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 2, - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - }, - ) - - -@pytest.mark.skipif( - DEFAULT_RUNTIME == "rapidsmpf", - reason="Uses explicit task graph.", -) -def test_single_cluster(): - # Test that the single cluster clears - # the cache as tasks are executed. - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 4, - "cluster": "single", - "runtime": DEFAULT_RUNTIME, - }, - ) - left = pl.LazyFrame( - { - "x": range(15), - "y": [1, 2, 3] * 5, - "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 3, - } - ) - right = pl.LazyFrame( - { - "xx": range(6), - "y": [2, 4, 3] * 2, - "zz": [1, 2] * 3, - } - ) - q = left.join(right, on="y").group_by("y").agg(pl.col("zz").mean()).sort(by="y") - - config_options = ConfigOptions.from_polars_engine(engine) - ir = Translator(q._ldf.visit(), engine).translate_ir() - ir, partition_info = lower_ir_graph( - ir, config_options, collect_statistics(ir, config_options) - ) - graph, key = task_graph( - ir, - partition_info, - config_options, - ) - scheduler = get_scheduler(config_options) - cache = {} - result = scheduler(graph, key, cache=cache) - assert_frame_equal(result.to_polars(), q.collect()) - - # The cache should only contain the final result - assert set(cache) == {key} - - 
-@pytest.mark.skipif( - DEFAULT_RUNTIME == "rapidsmpf", - reason="Uses explicit task graph.", -) -def test_task_graph_is_pickle_serializable(engine): - # Dask will fall back to using cloudpickle to serialize the task graph if - # necessary. We'd like to avoid that, since cloudpickle serialization / - # deserialization is typically slower than pickle. - - left = pl.LazyFrame( - { - "a": [1, 2, 3, 1, None], - "b": [1, 2, 3, 4, 5], - "c": [2, 3, 4, 5, 6], - } - ) - right = pl.LazyFrame( - { - "a": [1, 4, 3, 7, None, None, 1], - "c": [2, 3, 4, 5, 6, 7, 8], - "d": [6, None, 7, 8, -1, 2, 4], - } - ) - q = left.join(right, on="a").group_by("a").agg(pl.col("c").sum()) - - config_options = ConfigOptions.from_polars_engine(engine) - ir = Translator(q._ldf.visit(), engine).translate_ir() - ir, partition_info = lower_ir_graph( - ir, config_options, collect_statistics(ir, config_options) - ) - graph, _ = task_graph( - ir, - partition_info, - config_options, - ) - - pickle.loads(pickle.dumps(graph)) # no exception diff --git a/python/cudf_polars/tests/experimental/legacy/test_shuffle.py b/python/cudf_polars/tests/experimental/legacy/test_shuffle.py deleted file mode 100644 index f3c92c6450e..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/test_shuffle.py +++ /dev/null @@ -1,101 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -from typing import Literal, cast - -import pytest - -import polars as pl -from polars.testing import assert_frame_equal - -from cudf_polars import Translator -from cudf_polars.dsl.expr import Col, NamedExpr -from cudf_polars.experimental.parallel import evaluate_streaming, lower_ir_graph -from cudf_polars.experimental.shuffle import Shuffle -from cudf_polars.experimental.statistics import collect_statistics -from cudf_polars.testing.asserts import DEFAULT_CLUSTER, DEFAULT_RUNTIME -from cudf_polars.utils.config import ConfigOptions - -SHUFFLE_METHODS = ["tasks", None] if DEFAULT_RUNTIME == "tasks" else [None] - - -@pytest.fixture(scope="module", params=SHUFFLE_METHODS) -def engine(request): - return pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 4, - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - "shuffle_method": request.param, - }, - ) - - -@pytest.fixture(scope="module") -def df(): - return pl.LazyFrame( - { - "x": [1, 2, 3, 4, 5, 6, 7], - "y": [1, 1, 1, 1, 1, 1, 1], - "z": ["a", "b", "c", "d", "e", "f", "g"], - } - ) - - -def test_hash_shuffle(df: pl.LazyFrame, engine: pl.GPUEngine) -> None: - # Extract translated IR - qir = Translator(df._ldf.visit(), engine).translate_ir() - - # Add first Shuffle node - keys = (NamedExpr("x", Col(qir.schema["x"], "x")),) - options = ConfigOptions.from_polars_engine(engine) - assert options.executor.name == "streaming" - qir1 = Shuffle( - qir.schema, - keys, - options.executor.shuffle_method, - qir, - ) - - # Add second Shuffle node (on the same keys) - qir2 = Shuffle( - qir.schema, - keys, - options.executor.shuffle_method, - qir1, - ) - - # Check that sequential shuffles on the same keys - # are replaced with a single shuffle node - partition_info = lower_ir_graph(qir2, options, collect_statistics(qir2, options))[1] - assert len([node for node in partition_info if 
isinstance(node, Shuffle)]) == 1 - - # Add second Shuffle node (on different keys) - keys2 = (NamedExpr("z", Col(qir.schema["z"], "z")),) - qir3 = Shuffle( - qir2.schema, - keys2, - options.executor.shuffle_method, - qir2, - ) - - # Check that we have an additional shuffle - # node after shuffling on different keys - partition_info = lower_ir_graph(qir3, options, collect_statistics(qir3, options))[1] - assert len([node for node in partition_info if isinstance(node, Shuffle)]) == 2 - - # Check that streaming evaluation works - result = evaluate_streaming( - qir3, - options, - ) - # Cast needed because polars' EngineType "cpu" isn't publicly exported. - # https://github.com/pola-rs/polars/issues/17420 - expect = df.collect( - engine=cast(Literal["auto", "in-memory", "streaming", "gpu"], "cpu") - ) - assert_frame_equal(result, expect, check_row_order=False) diff --git a/python/cudf_polars/tests/experimental/legacy/test_shuffler.py b/python/cudf_polars/tests/experimental/legacy/test_shuffler.py deleted file mode 100644 index 04b6e5f2405..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/test_shuffler.py +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import pytest - -import polars as pl - -from cudf_polars.testing.asserts import ( - DEFAULT_CLUSTER, - DEFAULT_RUNTIME, - assert_gpu_result_equal, -) - -REQUIRE_TASKS_RUNTIME = pytest.mark.skipif( - DEFAULT_RUNTIME != "tasks", reason="Requires 'tasks' runtime." 
-) - - -@REQUIRE_TASKS_RUNTIME -@pytest.mark.parametrize("max_rows_per_partition", [1, 5]) -def test_join_rapidsmpf_single(max_rows_per_partition: int) -> None: - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": max_rows_per_partition, - "broadcast_join_limit": 2, - "shuffle_method": "rapidsmpf", - "cluster": "single", - "runtime": DEFAULT_RUNTIME, - }, - ) - - left = pl.LazyFrame( - { - "x": range(15), - "y": [1, 2, 3] * 5, - "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 3, - } - ) - right = pl.LazyFrame( - { - "xx": range(6), - "y": [2, 4, 3] * 2, - "zz": [1, 2] * 3, - } - ) - q = left.join(right, on="y", how="inner") - - assert_gpu_result_equal(q, engine=engine, check_row_order=False) - - -@REQUIRE_TASKS_RUNTIME -def test_sort_stable_rapidsmpf_warns(): - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 3, - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - "shuffle_method": "rapidsmpf", - "fallback_mode": "warn", - }, - ) - - df = pl.LazyFrame( - { - "x": range(15), - "y": [1, 2, 3] * 5, - "z": [1.0, 2.0, 3.0, 4.0, 5.0] * 3, - } - ) - - q = df.sort(by=["y", "z"], maintain_order=True) - with pytest.warns(UserWarning, match="Falling back to shuffle_method"): - assert_gpu_result_equal(q, engine=engine, check_row_order=True) diff --git a/python/cudf_polars/tests/experimental/legacy/test_sort.py b/python/cudf_polars/tests/experimental/legacy/test_sort.py deleted file mode 100644 index ce148879548..00000000000 --- a/python/cudf_polars/tests/experimental/legacy/test_sort.py +++ /dev/null @@ -1,152 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import pytest - -import polars as pl - -from cudf_polars.testing.asserts import ( - DEFAULT_CLUSTER, - DEFAULT_RUNTIME, - assert_gpu_result_equal, -) - - -@pytest.fixture(scope="module") -def engine(): - return pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 3, - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - "shuffle_method": "tasks", - "fallback_mode": "raise", - }, - ) - - -@pytest.fixture(scope="module") -def engine_large(): - return pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 2_100, - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - "shuffle_method": "tasks", - "fallback_mode": "raise", - }, - ) - - -@pytest.fixture(scope="module") -def df(): - return pl.LazyFrame( - { - "x": [1, 2, 3, 4, 5, 6, 7], - "y": [1, 6, 7, 2, 5, 4, 3], - "z": ["e", "c", "b", "g", "a", "f", "d"], - } - ) - - -def large_frames(): - x = [1.0] * 10_000 - x[-1] = float("nan") - y = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] * 1000 - - yield pytest.param( - pl.LazyFrame( - { - "x": x, - } - ), - ["x"], - False, - id="all_equal_one_nan", - ) - - yield pytest.param( - pl.LazyFrame( - { - "x": x, - "y": y, - } - ), - ["x", "y"], - False, - id="two_cols", - ) - - idx = list(range(10_000)) - yield pytest.param( - pl.LazyFrame( - { - "x": x, - "y": y, - "idx": idx, - } - ), - ["x", "y"], - True, - id="two_col_stable", - ) - - -def test_sort(df, engine): - q = df.sort(by=["y", "z"]) - assert_gpu_result_equal(q, engine=engine) - - -@pytest.mark.parametrize("large_df,by,stable", list(large_frames())) -@pytest.mark.parametrize( - "nulls_last,descending", [(True, False), (True, True), (False, True)] -) -def test_large_sort(large_df, by, engine_large, stable, nulls_last, descending): - q = large_df.sort( - by=by, nulls_last=nulls_last, maintain_order=stable, descending=descending - ) - 
assert_gpu_result_equal(q, engine=engine_large) - - -def test_sort_head(df, engine): - q = df.sort(by=["y", "z"]).head(2) - assert_gpu_result_equal(q, engine=engine) - - -def test_sort_tail(df, engine): - q = df.sort(by=["y", "z"]).tail(2) - assert_gpu_result_equal(q, engine=engine) - - -@pytest.mark.parametrize("offset", [1, -4]) -def test_sort_slice(df, engine, offset): - # Slice in the middle, which distributed sorts need to be careful with - q = df.sort(by=["y", "z"]).slice(offset, 2) - with pytest.raises( - NotImplementedError, - match="This slice not supported for multiple partitions.", - ): - assert_gpu_result_equal(q, engine=engine) - - -def test_sort_after_sparse_join(): - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, - "max_rows_per_partition": 4, - }, - ) - - left = pl.LazyFrame({"foo": list(range(5)), "bar": list(range(5))}) - right = pl.LazyFrame({"foo": list(range(1))}) - q = left.join(right, on="foo", how="inner").sort(by=["foo"]) - assert_gpu_result_equal(q, engine=engine) diff --git a/python/cudf_polars/tests/experimental/test_dask_serialize.py b/python/cudf_polars/tests/experimental/test_dask_serialize.py deleted file mode 100644 index 0d7c981bc52..00000000000 --- a/python/cudf_polars/tests/experimental/test_dask_serialize.py +++ /dev/null @@ -1,129 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import pytest -from distributed.protocol import deserialize, serialize - -import polars as pl -from polars.testing.asserts import assert_frame_equal - -import rmm -from rmm.pylibrmm.stream import DEFAULT_STREAM - -from cudf_polars.containers import DataFrame -from cudf_polars.experimental.dask_registers import register -from cudf_polars.utils.cuda_stream import get_dask_cuda_stream - -# Must register serializers before running tests -register() - - -def convert_to_rmm(frame): - """Convert frame to RMM to simulate Dask UCX transfers.""" - if hasattr(frame, "__cuda_array_interface__"): - buf = rmm.DeviceBuffer(size=frame.nbytes) - buf.copy_from_device(frame) - return buf - else: - return frame - - -@pytest.mark.filterwarnings( - # If exceptions in threads aren't handled, they get raised as a warning by - # Pytest. The warnings raised by this test correspond to unhandled - # `ResourceWarning`s in `distributed.node` - # - # Since Pytest 8, these warnings get elevated to errors and exit the test - # suite, so we selectively filter them here if the unraisable exception - # concerns `socket.socket` - "ignore:.*socket.socket.*:pytest.PytestUnraisableExceptionWarning" -) -@pytest.mark.parametrize( - "polars_tbl", - [ - pl.DataFrame(), - pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), - pl.DataFrame({"a": [1, 2, 3]}), - pl.DataFrame({"a": [1], "b": [2], "c": [3]}), - pl.DataFrame({"a": ["a", "bb", "ccc"]}), - pl.DataFrame({"a": [1, 2, None], "b": [None, 3, 4]}), - pl.DataFrame({"a": range(int(1e7))}), - ], -) -@pytest.mark.parametrize("protocol", ["cuda", "cuda_rmm", "dask"]) -@pytest.mark.parametrize( - "context", - [ - None, - {}, - { - "stream": DEFAULT_STREAM, - "device_mr": rmm.mr.get_current_device_resource(), - "staging_device_buffer": rmm.DeviceBuffer(size=2**20), - }, - ], -) -def test_dask_serialization_roundtrip(polars_tbl, protocol, context): - stream = 
get_dask_cuda_stream() - df = DataFrame.from_polars(polars_tbl, stream=stream) - - cuda_rmm = protocol == "cuda_rmm" - protocol = "cuda" if protocol == "cuda_rmm" else protocol - - header, frames = serialize( - df, on_error="raise", serializers=[protocol], context=context - ) - if cuda_rmm: - # Simulate Dask UCX transfers - frames = [convert_to_rmm(f) for f in frames] - res = deserialize(header, frames, deserializers=[protocol]) - - assert_frame_equal(df.to_polars(), res.to_polars()) - - # Check that we can serialize individual columns - for column in df.columns: - expect = DataFrame([column], stream=df.stream) - - header, frames = serialize( - column, on_error="raise", serializers=[protocol], context=context - ) - if cuda_rmm: - # Simulate Dask UCX transfers - frames = [convert_to_rmm(f) for f in frames] - res = deserialize(header, frames, deserializers=[protocol]) - - assert_frame_equal( - expect.to_polars(), DataFrame([res], stream=df.stream).to_polars() - ) - - -def test_dask_serialization_error(): - df = DataFrame.from_polars( - pl.DataFrame({"a": [1, 2, 3]}), stream=get_dask_cuda_stream() - ) - - header, frames = serialize( - df, - on_error="message", - serializers=["dask"], - context={ - "device_mr": rmm.mr.get_current_device_resource(), - "staging_device_buffer": rmm.DeviceBuffer(size=2**20), - }, - ) - assert header == {"serializer": "error"} - assert "ValueError: " in str(frames) - - header, frames = serialize( - df, - on_error="message", - serializers=["dask"], - context={ - "stream": df.stream, - "staging_device_buffer": rmm.DeviceBuffer(size=2**20), - }, - ) - assert header == {"serializer": "error"} - assert "ValueError: " in str(frames) diff --git a/python/cudf_polars/tests/experimental/test_dask_sizeof.py b/python/cudf_polars/tests/experimental/test_dask_sizeof.py deleted file mode 100644 index d4f2d16039f..00000000000 --- a/python/cudf_polars/tests/experimental/test_dask_sizeof.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: Copyright 
(c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import pytest -from dask.sizeof import sizeof - -import polars as pl - -from cudf_polars.containers import DataFrame -from cudf_polars.experimental.dask_registers import register -from cudf_polars.utils.cuda_stream import get_dask_cuda_stream - -# Must register sizeof dispatch before running tests -register() - - -@pytest.mark.parametrize( - "polars_tbl, size", - [ - (pl.DataFrame(), 0), - (pl.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}), 9 * 8), - (pl.DataFrame({"a": [1, 2, 3]}), 3 * 8), - (pl.DataFrame({"a": ["a"], "b": ["bc"]}), 2 * 8 + 3), - (pl.DataFrame({"a": [1, 2, None]}), 88), - ], -) -def test_dask_sizeof(polars_tbl, size): - df = DataFrame.from_polars(polars_tbl, stream=get_dask_cuda_stream()) - assert sizeof(df) == size - assert sum(sizeof(c) for c in df.columns) == size diff --git a/python/cudf_polars/tests/experimental/test_dask_tokenize.py b/python/cudf_polars/tests/experimental/test_dask_tokenize.py deleted file mode 100644 index 287c3a4c379..00000000000 --- a/python/cudf_polars/tests/experimental/test_dask_tokenize.py +++ /dev/null @@ -1,32 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
-# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import dask.tokenize -import pytest - -import polars as pl - -from cudf_polars.containers import DataType -from cudf_polars.dsl.expressions.base import Col, NamedExpr -from cudf_polars.experimental.dask_registers import register - -# Must register sizeof dispatch before running tests -register() - - -@pytest.mark.parametrize( - "value", - [ - NamedExpr("a", Col(DataType(pl.Int64()), "a")), - DataType(pl.Int64()), - ], - ids=["named_expr", "data_type"], -) -def test_tokenize(value: DataType | NamedExpr) -> None: - normalizer = dask.tokenize.normalize_token.dispatch(type(value)) - package = normalizer.__module__.split(".")[0] - assert package == "cudf_polars" - - dask.tokenize.tokenize(value) diff --git a/python/cudf_polars/tests/experimental/test_io_multirank.py b/python/cudf_polars/tests/experimental/test_io_multirank.py index e1265602304..631f12fd85c 100644 --- a/python/cudf_polars/tests/experimental/test_io_multirank.py +++ b/python/cudf_polars/tests/experimental/test_io_multirank.py @@ -104,7 +104,7 @@ def test_sink_parquet_empty_rank(engine: StreamingEngine, tmp_path: Path) -> Non @pytest.mark.parametrize( "cluster", - [Cluster.SPMD, Cluster.RAY, Cluster.DASK, Cluster.DISTRIBUTED], + [Cluster.SPMD, Cluster.RAY, Cluster.DASK], ) def test_sink_to_directory_false_raises(cluster: Cluster) -> None: """Explicit ``sink_to_directory=False`` is rejected for every multi-rank cluster.""" diff --git a/python/cudf_polars/tests/experimental/test_sink.py b/python/cudf_polars/tests/experimental/test_sink.py index 4b28830f287..9b0573d2cb4 100644 --- a/python/cudf_polars/tests/experimental/test_sink.py +++ b/python/cudf_polars/tests/experimental/test_sink.py @@ -92,19 +92,6 @@ def test_sink_parquet_directory( assert len(list(check_path.iterdir())) == expected_file_count -def test_sink_parquet_raises_distributed() -> None: - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - 
executor_options={ - "cluster": "distributed", - "sink_to_directory": False, - }, - ) - with pytest.raises(ValueError, match="distributed cluster"): - ConfigOptions.from_polars_engine(engine) - - def test_sink_parquet_raises_spmd(spmd_comm): from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 71f6c946184..3cd66bc527d 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -51,16 +51,6 @@ def rapidsmpf_single_available(request, monkeypatch): return request.param -@pytest.fixture(params=[False, True], ids=["norapidsmpf.dask", "rapidsmpf.dask"]) -def rapidsmpf_distributed_available(request, monkeypatch): - monkeypatch.setattr( - cudf_polars.utils.config, - "rapidsmpf_distributed_available", - lambda: request.param, - ) - return request.param - - def test_polars_verbose_warns(monkeypatch): def raise_unimplemented(self, *args): raise NotImplementedError("We don't support this") @@ -243,7 +233,7 @@ def test_parquet_options_from_none() -> None: def test_validate_streaming_executor_shuffle_method( - *, rapidsmpf_distributed_available: bool, rapidsmpf_single_available: bool + *, rapidsmpf_single_available: bool ) -> None: config = ConfigOptions.from_polars_engine( pl.GPUEngine( @@ -254,21 +244,6 @@ def test_validate_streaming_executor_shuffle_method( assert config.executor.name == "streaming" assert config.executor.shuffle_method == "tasks" - # rapidsmpf with distributed cluster - engine = pl.GPUEngine( - executor="streaming", - executor_options={"shuffle_method": "rapidsmpf", "cluster": "distributed"}, - ) - if rapidsmpf_distributed_available: - config = ConfigOptions.from_polars_engine(engine) - assert config.executor.name == "streaming" - assert config.executor.shuffle_method == "rapidsmpf" - else: - with pytest.raises( - ValueError, match="rapidsmpf.integrations.dask is not installed" - ): - 
ConfigOptions.from_polars_engine(engine) - # rapidsmpf with single cluster engine = pl.GPUEngine( executor="streaming", @@ -344,10 +319,7 @@ def test_validate_cluster() -> None: ) -def test_validate_shuffle_method_defaults( - *, - rapidsmpf_distributed_available: bool, -) -> None: +def test_validate_shuffle_method_defaults() -> None: config = ConfigOptions.from_polars_engine( pl.GPUEngine( executor="streaming", @@ -356,20 +328,6 @@ def test_validate_shuffle_method_defaults( assert config.executor.name == "streaming" assert config.executor.shuffle_method == "tasks" # Default for single cluster - # Test default for distributed cluster depends on rapidsmpf availability - config = ConfigOptions.from_polars_engine( - pl.GPUEngine( - executor="streaming", - executor_options={"cluster": "distributed"}, - ) - ) - assert config.executor.name == "streaming" - if rapidsmpf_distributed_available: - # Should be "rapidsmpf" if available, otherwise "tasks" - assert config.executor.shuffle_method == "rapidsmpf" - else: - assert config.executor.shuffle_method == "tasks" - with pytest.raises(ValueError, match="'foo' is not a valid ShuffleMethod"): ConfigOptions.from_polars_engine( pl.GPUEngine( @@ -387,7 +345,6 @@ def test_validate_shuffle_method_defaults( "target_partition_size", "groupby_n_ary", "broadcast_join_limit", - "rapidsmpf_spill", "sink_to_directory", "client_device_threshold", "max_io_threads", @@ -447,45 +404,31 @@ def test_parquet_options_from_env(monkeypatch: pytest.MonkeyPatch) -> None: ConfigOptions.from_polars_engine(engine) -def test_config_option_from_env( - monkeypatch: pytest.MonkeyPatch, *, rapidsmpf_distributed_available: bool -) -> None: +def test_config_option_from_env(monkeypatch: pytest.MonkeyPatch) -> None: with monkeypatch.context() as m: - m.setenv("CUDF_POLARS__EXECUTOR__CLUSTER", "distributed") + m.setenv("CUDF_POLARS__EXECUTOR__CLUSTER", "single") m.setenv("CUDF_POLARS__EXECUTOR__FALLBACK_MODE", "silent") 
m.setenv("CUDF_POLARS__EXECUTOR__MAX_ROWS_PER_PARTITION", "42") m.setenv("CUDF_POLARS__EXECUTOR__UNIQUE_FRACTION", '{"a": 0.5}') m.setenv("CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE", "100") m.setenv("CUDF_POLARS__EXECUTOR__GROUPBY_N_ARY", "43") m.setenv("CUDF_POLARS__EXECUTOR__BROADCAST_JOIN_LIMIT", "44") - m.setenv("CUDF_POLARS__EXECUTOR__RAPIDSMPF_SPILL", "1") - m.setenv("CUDF_POLARS__EXECUTOR__SINK_TO_DIRECTORY", "1") + m.setenv("CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", "tasks") m.setenv("CUDF_POLARS__CUDA_STREAM_POLICY", "default") - if rapidsmpf_distributed_available: - m.setenv("CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", "rapidsmpf") - else: - m.setenv("CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", "tasks") - engine = pl.GPUEngine() config = ConfigOptions.from_polars_engine(engine) assert config.executor.name == "streaming" - assert config.executor.cluster == "distributed" + assert config.executor.cluster == "single" assert config.executor.fallback_mode == "silent" assert config.executor.max_rows_per_partition == 42 assert config.executor.unique_fraction == {"a": 0.5} assert config.executor.target_partition_size == 100 assert config.executor.groupby_n_ary == 43 assert config.executor.broadcast_join_limit == 44 - assert config.executor.rapidsmpf_spill is True - assert config.executor.sink_to_directory is True + assert config.executor.shuffle_method == "tasks" assert config.cuda_stream_policy is None - if rapidsmpf_distributed_available: - assert config.executor.shuffle_method == "rapidsmpf" - else: - assert config.executor.shuffle_method == "tasks" - def test_target_partition_from_env( monkeypatch: pytest.MonkeyPatch, recwarn: pytest.WarningsRecorder @@ -955,11 +898,11 @@ def test_num_py_executors_from_env( assert config.executor.num_py_executors == 8 -def test_distributed_sink_to_directory_false_raises() -> None: +def test_dask_sink_to_directory_false_raises() -> None: with pytest.raises( - ValueError, match="The distributed cluster requires sink_to_directory=True" 
+ ValueError, match="The dask cluster requires sink_to_directory=True" ): - StreamingExecutor(cluster=Cluster.DISTRIBUTED, sink_to_directory=False) + StreamingExecutor(cluster=Cluster.DASK, sink_to_directory=False) def test_get_dask_cuda_stream() -> None: diff --git a/python/cudf_polars/tests/test_profile.py b/python/cudf_polars/tests/test_profile.py index c0d5de759ad..cf48dc933d7 100644 --- a/python/cudf_polars/tests/test_profile.py +++ b/python/cudf_polars/tests/test_profile.py @@ -26,11 +26,10 @@ def test_profile_basic() -> None: assert_frame_equal(result, q.collect(engine="in-memory"), check_row_order=False) -@pytest.mark.parametrize("cluster", ["single", "distributed"]) -def test_profile_streaming_raises(cluster: str) -> None: +def test_profile_streaming_raises() -> None: df = pl.LazyFrame({"a": [1, 2, 3, 4]}) q = df.sort("a").group_by("a").len() - engine = pl.GPUEngine(executor="streaming", executor_options={"cluster": cluster}) + engine = pl.GPUEngine(executor="streaming", executor_options={"cluster": "single"}) with pytest.raises( NotImplementedError, match=r"profile\(\) is not supported with the streaming executor.", From 899508569e3ab17c60df230514745f4a0ec7cd75 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Mon, 4 May 2026 13:28:50 -0500 Subject: [PATCH 05/36] Fix StatsCollector.serialize to use value equality instead of object identity (#22366) Uses node directly as the dict key instead of `id(node)`, so nodes reconstructed on workers (introduced in #22287) are found correctly by value rather than failing with a `KeyError`. Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Tom Augspurger (https://github.com/TomAugspurger) - Mads R. B. 
Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/cudf/pull/22366 --- .../cudf_polars/experimental/base.py | 4 ++-- .../tests/experimental/test_stats.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index be3eb6aeb53..73ed9b3dbe1 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -118,9 +118,9 @@ def serialize(self, ir: IR) -> list[SerializedStatsEntry]: traversal of *ir* so that the result is independent of object identity. """ - node_to_idx = {id(node): i for i, node in enumerate(traversal([ir]))} + node_to_idx = {node: i for i, node in enumerate(traversal([ir]))} return [ - {"index": node_to_idx[id(node)], "info": info.serialize()} + {"index": node_to_idx[node], "info": info.serialize()} for node, info in self.scan_stats.items() ] diff --git a/python/cudf_polars/tests/experimental/test_stats.py b/python/cudf_polars/tests/experimental/test_stats.py index 2b08a23b862..7d0d5dc01e4 100644 --- a/python/cudf_polars/tests/experimental/test_stats.py +++ b/python/cudf_polars/tests/experimental/test_stats.py @@ -12,6 +12,8 @@ import polars as pl from cudf_polars import Translator +from cudf_polars.containers import DataType +from cudf_polars.dsl.ir import Empty, Projection from cudf_polars.experimental.base import SerializedDataSourceInfo, StatsCollector from cudf_polars.experimental.io import ( DataFrameSourceInfo, @@ -266,3 +268,20 @@ def test_serialize_stats_roundtrip_parquet( assert rt.row_count == info.row_count for col in ("x", "y", "z"): assert rt.column_storage_size(col) == info.column_storage_size(col) + + +def test_serialize_uses_value_equality() -> None: + schema = {"x": DataType(pl.Int64())} + scan_x = Empty(schema) + scan_y = Empty(schema) + assert scan_x == scan_y + assert scan_x is not scan_y + + root = 
Projection(schema, scan_y) + + stats = StatsCollector() + stats.scan_stats[scan_x] = DataFrameSourceInfo(100) + + result = stats.serialize(root) + assert len(result) == 1 + assert result[0]["index"] >= 0 From 5255d5118ed3ce606bf56478711421a641bd72e0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 4 May 2026 12:02:30 -0700 Subject: [PATCH 06/36] Pass managed pool MR explicitly in NDSH parquet data generation (#22344) Pass the managed-pool MR directly into each `cudf::datagen::generate_*` call instead of swapping it in as the current device resource and restoring on exit. Also fixes forwarding of the mr parameter down the datagen stack. There are still a few tiny allocations (KBs) that use the default mr because switching would require a copy. These should not cause OOM errors. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Bradley Dice (https://github.com/bdice) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/cudf/pull/22344 --- .../ndsh_data_generator.cpp | 19 ++++++------ .../ndsh_data_generator/table_helpers.cpp | 13 ++++---- cpp/benchmarks/ndsh/utilities.cpp | 30 +++++++------------ 3 files changed, 27 insertions(+), 35 deletions(-) diff --git a/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp index 6bdd3a0d87e..587758d84bb 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp @@ -379,7 +379,8 @@ std::unique_ptr generate_lineitem_partial(cudf::table_view const& o auto const pred = cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref, current_date_literal); auto mask = cudf::compute_column(cudf::table_view({l_shipdate_ts->view()}), pred, stream, mr); - auto mask_index_type = cudf::cast(mask->view(), cudf::data_type{cudf::type_id::INT8}); + auto mask_index_type = + cudf::cast(mask->view(), 
cudf::data_type{cudf::type_id::INT8}, stream, mr); auto const indices = cudf::test::fixed_width_column_wrapper({0, 1}).release(); auto const keys = cudf::test::strings_column_wrapper({"O", "F"}).release(); auto const gather_map = cudf::table_view({indices->view(), keys->view()}); @@ -465,7 +466,7 @@ std::unique_ptr generate_orders_dependent(cudf::table_view const& l requests[1].values = l_linestatus_mask; // Perform the aggregations - auto agg_result = gb.aggregate(requests); + auto agg_result = gb.aggregate(requests, stream, mr); // Create a `table_view` out of the `l_orderkey`, `count`, and `sum` columns auto const count = std::move(agg_result.second[0].results[0]); @@ -484,9 +485,9 @@ std::unique_ptr generate_orders_dependent(cudf::table_view const& l auto const count_ref = cudf::ast::column_reference(1); auto const sum_ref = cudf::ast::column_reference(2); auto const expr_a = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, sum_ref, count_ref); - auto const mask_a = cudf::compute_column(table, expr_a); - auto const o_orderstatus_intermediate = - cudf::copy_if_else(cudf::string_scalar("O"), cudf::string_scalar("F"), mask_a->view()); + auto const mask_a = cudf::compute_column(table, expr_a, stream, mr); + auto const o_orderstatus_intermediate = cudf::copy_if_else( + cudf::string_scalar("O"), cudf::string_scalar("F"), mask_a->view(), stream, mr); // Then, we evaluate an expression `sum == 0` and generate a boolean mask auto zero_scalar = cudf::numeric_scalar(0); @@ -497,9 +498,9 @@ std::unique_ptr generate_orders_dependent(cudf::table_view const& l cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, sum_ref, zero_literal); auto const expr_b = cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_b_left, expr_b_right); - auto const mask_b = cudf::compute_column(table, expr_b); + auto const mask_b = cudf::compute_column(table, expr_b, stream, mr); return cudf::copy_if_else( - cudf::string_scalar("P"), o_orderstatus_intermediate->view(), 
mask_b->view()); + cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view(), stream, mr); }(); orders_dependent_columns.push_back(std::move(o_orderstatus)); @@ -514,7 +515,7 @@ std::unique_ptr generate_orders_dependent(cudf::table_view const& l requests.push_back(cudf::groupby::aggregation_request()); requests[0].aggregations.push_back(cudf::make_sum_aggregation()); requests[0].values = l_charge->view(); - auto agg_result = gb.aggregate(requests); + auto agg_result = gb.aggregate(requests, stream, mr); return std::move(agg_result.second[0].results[0]); }(); orders_dependent_columns.push_back(std::move(o_totalprice)); @@ -726,7 +727,7 @@ generate_orders_lineitem_part(double scale_factor, auto joined_table_columns = joined_table->release(); auto const l_quantity = std::move(joined_table_columns[1]); auto const l_quantity_fp = - cudf::cast(l_quantity->view(), cudf::data_type{cudf::type_id::FLOAT64}); + cudf::cast(l_quantity->view(), cudf::data_type{cudf::type_id::FLOAT64}, stream, mr); auto const p_retailprice = std::move(joined_table_columns[3]); return cudf::binary_operation(l_quantity_fp->view(), p_retailprice->view(), diff --git a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp index ef6cc4971e6..8510e9d6c62 100644 --- a/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp +++ b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp @@ -45,8 +45,9 @@ std::unique_ptr add_calendrical_days(cudf::column_view const& time rmm::device_async_resource_ref mr) { CUDF_BENCHMARK_RANGE(); - auto const days_duration_type = cudf::cast(days, cudf::data_type{cudf::type_id::DURATION_DAYS}); - auto const data_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}; + auto const days_duration_type = + cudf::cast(days, cudf::data_type{cudf::type_id::DURATION_DAYS}, stream, mr); + auto const data_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS}; return cudf::binary_operation( 
timestamp_days, days_duration_type->view(), cudf::binary_operator::ADD, data_type, stream, mr); } @@ -358,13 +359,13 @@ std::unique_ptr perform_left_join(cudf::table_view const& left_inpu { CUDF_BENCHMARK_RANGE(); auto const part_a = cudf::strings::from_integers( - generate_random_numeric_column(10, 34, num_rows, stream, mr)->view()); + generate_random_numeric_column(10, 34, num_rows, stream, mr)->view(), stream, mr); auto const part_b = cudf::strings::from_integers( - generate_random_numeric_column(100, 999, num_rows, stream, mr)->view()); + generate_random_numeric_column(100, 999, num_rows, stream, mr)->view(), stream, mr); auto const part_c = cudf::strings::from_integers( - generate_random_numeric_column(100, 999, num_rows, stream, mr)->view()); + generate_random_numeric_column(100, 999, num_rows, stream, mr)->view(), stream, mr); auto const part_d = cudf::strings::from_integers( - generate_random_numeric_column(1000, 9999, num_rows, stream, mr)->view()); + generate_random_numeric_column(1000, 9999, num_rows, stream, mr)->view(), stream, mr); auto const phone_parts_table = cudf::table_view({part_a->view(), part_b->view(), part_c->view(), part_d->view()}); return cudf::strings::concatenate(phone_parts_table, diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp index 61c9c9d32e9..89315bacec1 100644 --- a/cpp/benchmarks/ndsh/utilities.cpp +++ b/cpp/benchmarks/ndsh/utilities.cpp @@ -393,13 +393,9 @@ void generate_parquet_data_sources(double scale_factor, { CUDF_BENCHMARK_RANGE(); - // Set the memory resource to the managed pool - auto old_mr = cudf::get_current_device_resource_ref(); - // TODO: if old_mr is already managed pool or managed, don't create new one. + // Use a managed pool for parquet generation. 
rmm::mr::pool_memory_resource managed_pool_mr{rmm::mr::managed_memory_resource{}, rmm::percent_of_free_device_memory(50)}; - cudf::set_current_device_resource(managed_pool_mr); - // drawback: if already pool takes 50% of free memory, we are left with 50% of 50% of free memory std::unordered_set const requested_table_names = [&table_names]() { if (table_names.empty()) { @@ -414,9 +410,11 @@ void generate_parquet_data_sources(double scale_factor, }); std::unordered_map> tables; + auto const stream = cudf::get_default_stream(); + if (sources.count("orders") or sources.count("lineitem") or sources.count("part")) { - auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto [orders, lineitem, part] = + cudf::datagen::generate_orders_lineitem_part(scale_factor, stream, managed_pool_mr); if (sources.count("orders")) { write_to_parquet_device_buffer(orders, SCHEMAS.at("orders"), sources.at("orders")); orders = {}; @@ -432,35 +430,27 @@ void generate_parquet_data_sources(double scale_factor, } if (sources.count("partsupp")) { - auto partsupp = cudf::datagen::generate_partsupp( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto partsupp = cudf::datagen::generate_partsupp(scale_factor, stream, managed_pool_mr); write_to_parquet_device_buffer(partsupp, SCHEMAS.at("partsupp"), sources.at("partsupp")); } if (sources.count("supplier")) { - auto supplier = cudf::datagen::generate_supplier( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto supplier = cudf::datagen::generate_supplier(scale_factor, stream, managed_pool_mr); write_to_parquet_device_buffer(supplier, SCHEMAS.at("supplier"), sources.at("supplier")); } if (sources.count("customer")) { - auto customer = cudf::datagen::generate_customer( - scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); + auto 
customer = cudf::datagen::generate_customer(scale_factor, stream, managed_pool_mr); write_to_parquet_device_buffer(customer, SCHEMAS.at("customer"), sources.at("customer")); } if (sources.count("nation")) { - auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + auto nation = cudf::datagen::generate_nation(stream, managed_pool_mr); write_to_parquet_device_buffer(nation, SCHEMAS.at("nation"), sources.at("nation")); } if (sources.count("region")) { - auto region = cudf::datagen::generate_region(cudf::get_default_stream(), - cudf::get_current_device_resource_ref()); + auto region = cudf::datagen::generate_region(stream, managed_pool_mr); write_to_parquet_device_buffer(region, SCHEMAS.at("region"), sources.at("region")); } - - // Restore the original memory resource - cudf::set_current_device_resource(old_mr); } From 9407fd6686805881559702388a43503bc4726735 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 4 May 2026 21:31:53 +0000 Subject: [PATCH 07/36] Fix compile warnings in libcudf examples (#22335) Fixes some compile warnings in the libcudf examples. These are deprecation warnings about the missing alignment parameter for the custom allocators in the `hybrid_scan_io` and `parquet_io` examples. ``` /cudf/cpp/examples/parquet_io/io_source.hpp:61:66: warning: 'void cuda::mr::__4::__ibasic_async_resource< >::deallocate(cuda::__4::stream_ref, void*, size_t) [with = {cuda::__4::__ireference, cuda::mr::__4::__ibasic_resource<>, cuda::mr::__4::__with_property::__iproperty<>, cuda::mr::__4::__with_property::__iproperty<>, cuda::__4::__icopyable<>, cuda::__4::__iequality_comparable<> > >}; size_t = long unsigned int]' is deprecated: Specify an explicit alignment argument. The default alignment will be removed in a future release. 
[-Wdeprecated-declarations] 61 | void deallocate(T* ptr, std::size_t n) noexcept { mr.deallocate(stream, ptr, n * sizeof(T)); } ``` Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/22335 --- cpp/examples/hybrid_scan_io/io_source.hpp | 7 +++++-- cpp/examples/parquet_io/io_source.hpp | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/examples/hybrid_scan_io/io_source.hpp b/cpp/examples/hybrid_scan_io/io_source.hpp index 12670d2208d..500e9476f50 100644 --- a/cpp/examples/hybrid_scan_io/io_source.hpp +++ b/cpp/examples/hybrid_scan_io/io_source.hpp @@ -53,12 +53,15 @@ struct pinned_allocator : public std::allocator { T* allocate(std::size_t n) { - auto ptr = mr.allocate(stream, n * sizeof(T)); + auto ptr = mr.allocate(stream, n * sizeof(T), alignof(T)); stream.synchronize(); return static_cast(ptr); } - void deallocate(T* ptr, std::size_t n) noexcept { mr.deallocate(stream, ptr, n * sizeof(T)); } + void deallocate(T* ptr, std::size_t n) noexcept + { + mr.deallocate(stream, ptr, n * sizeof(T), alignof(T)); + } private: rmm::host_async_resource_ref mr; diff --git a/cpp/examples/parquet_io/io_source.hpp b/cpp/examples/parquet_io/io_source.hpp index 6862b326ca9..713d35d8044 100644 --- a/cpp/examples/parquet_io/io_source.hpp +++ b/cpp/examples/parquet_io/io_source.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -53,12 +53,15 @@ struct pinned_allocator : public std::allocator { T* allocate(std::size_t n) { - auto ptr = mr.allocate(stream, n * sizeof(T)); + auto ptr = mr.allocate(stream, n * sizeof(T), alignof(T)); stream.synchronize(); return static_cast(ptr); } - void deallocate(T* ptr, std::size_t n) noexcept { mr.deallocate(stream, ptr, n * sizeof(T)); } + void deallocate(T* ptr, std::size_t n) noexcept + { + mr.deallocate(stream, ptr, n * sizeof(T), alignof(T)); + } private: rmm::host_async_resource_ref mr; From 0e82b62836340a503f8cfdcd8a875524a6654f03 Mon Sep 17 00:00:00 2001 From: Yunsong Wang <12716979+PointKernel@users.noreply.github.com> Date: Mon, 4 May 2026 15:52:42 -0700 Subject: [PATCH 08/36] Add skip axis to all join benchmarks (#22241) This PR updates the join benchmarks to include a skip axis, allowing users to optionally include large table sizes, which is not possible in the current setup due to its unconditional skip of those sizes. 
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/22241 --- cpp/benchmarks/join/conditional_join.cu | 8 ++-- cpp/benchmarks/join/distinct_join.cu | 6 ++- cpp/benchmarks/join/filter_join_indices.cu | 14 ++++-- cpp/benchmarks/join/join.cu | 12 +++-- cpp/benchmarks/join/join_common.hpp | 33 +++++++++---- cpp/benchmarks/join/left_join.cu | 56 +++++++++------------- cpp/benchmarks/join/mixed_join.cu | 20 +++++--- cpp/benchmarks/join/multiplicity_join.cu | 9 ++-- cpp/benchmarks/join/sort_merge_join.cu | 8 ++-- 9 files changed, 96 insertions(+), 70 deletions(-) diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index dacd1c7a648..98d7c49b20d 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -53,7 +53,8 @@ NVBENCH_BENCH_TYPES(nvbench_conditional_inner_join, .set_name("conditional_inner_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", CONDITIONAL_JOIN_SIZE_RANGE) - .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE); + .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_conditional_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -62,4 +63,5 @@ NVBENCH_BENCH_TYPES(nvbench_conditional_left_join, .set_name("conditional_left_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", CONDITIONAL_JOIN_SIZE_RANGE) - .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE); + .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index ea64a26bbbd..6fe8928128c 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -53,7 +53,8 @@ NVBENCH_BENCH_TYPES(nvbench_distinct_inner_join, .set_name("distinct_inner_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_distinct_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -62,4 +63,5 @@ NVBENCH_BENCH_TYPES(nvbench_distinct_left_join, .set_name("distinct_left_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); diff --git a/cpp/benchmarks/join/filter_join_indices.cu 
b/cpp/benchmarks/join/filter_join_indices.cu index 5f6cb43afdc..1cc66d475c7 100644 --- a/cpp/benchmarks/join/filter_join_indices.cu +++ b/cpp/benchmarks/join/filter_join_indices.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -131,7 +131,8 @@ NVBENCH_BENCH_TYPES(nvbench_filter_join_indices_inner_join, .set_name("filter_join_indices_inner_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_filter_join_indices_inner_join_complex_ast, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -141,7 +142,8 @@ NVBENCH_BENCH_TYPES(nvbench_filter_join_indices_inner_join_complex_ast, .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) .add_int64_axis("right_size", JOIN_SIZE_RANGE) - .add_int64_axis("ast_levels", {1, 5, 10}); + .add_int64_axis("ast_levels", {1, 5, 10}) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_filter_join_indices_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -150,7 +152,8 @@ NVBENCH_BENCH_TYPES(nvbench_filter_join_indices_left_join, .set_name("filter_join_indices_left_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_filter_join_indices_full_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -159,4 +162,5 @@ NVBENCH_BENCH_TYPES(nvbench_filter_join_indices_full_join, .set_name("filter_join_indices_full_join") .set_type_axes_names({"Nullable", "NullEquality", 
"DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu index d5404a99611..88d2cf22dcd 100644 --- a/cpp/benchmarks/join/join.cu +++ b/cpp/benchmarks/join/join.cu @@ -91,7 +91,8 @@ NVBENCH_BENCH_TYPES(nvbench_inner_join, .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("num_keys", nvbench::range(1, 5, 1)) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -101,7 +102,8 @@ NVBENCH_BENCH_TYPES(nvbench_left_join, .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("num_keys", nvbench::range(1, 5, 1)) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_full_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -111,7 +113,8 @@ NVBENCH_BENCH_TYPES(nvbench_full_join, .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("num_keys", nvbench::range(1, 5, 1)) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_inner_join_selectivity, NVBENCH_TYPE_AXES(DEFAULT_JOIN_NULL_EQUALITY, SELECTIVITY_JOIN_DATATYPES)) @@ -120,4 +123,5 @@ NVBENCH_BENCH_TYPES(nvbench_inner_join_selectivity, .add_int64_axis("left_size", {100'000'000}) .add_int64_axis("right_size", {100'000}) .add_int64_axis("num_probes", {4}) - .add_float64_axis("selectivity", 
JOIN_SELECTIVITY_RANGE); + .add_float64_axis("selectivity", JOIN_SELECTIVITY_RANGE) + .add_int64_axis("skip_large_sizes", {1}); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 807335d0336..aa594a3c5b6 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -60,6 +60,26 @@ inline void create_complex_ast_expression(cudf::ast::tree& tree, cudf::size_type } } +// Returns true (and marks `state` as skipped) when the `skip_large_sizes` axis is enabled and +// the build side of this benchmark is larger than the probe side. `build_is_right` selects which +// side is the build side: true for hash-style benches that preprocess the right table, false for +// benches like `mark_join` that preprocess the left. +inline bool should_skip_large_sizes(nvbench::state& state, bool build_is_right = true) +{ + if (state.get_int64("skip_large_sizes") == 0) { return false; } + auto const left_size = state.get_int64("left_size"); + auto const right_size = state.get_int64("right_size"); + if (build_is_right && right_size > left_size) { + state.skip("build (right) should be smaller than probe (left)"); + return true; + } + if (!build_is_right && left_size > right_size) { + state.skip("build (left) should be smaller than probe (right)"); + return true; + } + return false; +} + template & key_types, Join JoinFunc, - int multiplicity = 1, - double selectivity = 0.3, - bool skip_large_right_tbl = true) + int multiplicity = 1, + double selectivity = 0.3, + bool build_is_right = true) { + if (should_skip_large_sizes(state, build_is_right)) { return; } + auto const right_size = static_cast(state.get_int64("right_size")); auto const left_size = static_cast(state.get_int64("left_size")); - if (skip_large_right_tbl && right_size > left_size) { - state.skip("Skip large right table"); - return; - } - auto const num_keys = key_types.size(); auto const num_payload_cols = 2; auto [build_table, probe_table] = 
generate_input_tables( diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu index 37cf4d6ae8c..3f567fc0371 100644 --- a/cpp/benchmarks/join/left_join.cu +++ b/cpp/benchmarks/join/left_join.cu @@ -18,18 +18,10 @@ void nvbench_left_anti_join(nvbench::state& state, nvbench::enum_type>) { auto const num_probes = static_cast(state.get_int64("num_probes")); - auto const left_size = state.get_int64("left_size"); - auto const right_size = state.get_int64("right_size"); auto const selectivity = state.get_float64("selectivity"); auto const join_type = state.get_string("join_type"); - if (join_type == "mark_join" && left_size > right_size) { - state.skip("mark_join: build (left) should be smaller than probe (right)"); - return; - } - if (join_type == "filtered_join" && right_size > left_size) { - state.skip("filtered_join: build (right) should be smaller than probe (left)"); - return; - } + // filtered_join builds on the right side; mark_join builds on the left side. + auto const build_is_right = (join_type == "filtered_join"); auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); auto join = [num_probes, &join_type](cudf::table_view const& left, @@ -51,9 +43,8 @@ void nvbench_left_anti_join(nvbench::state& state, } }; - auto const skip_large_right = (join_type == "filtered_join"); BM_join( - state, dtypes, join, 1, selectivity, skip_large_right); + state, dtypes, join, 1, selectivity, build_is_right); } template @@ -63,18 +54,10 @@ void nvbench_left_semi_join(nvbench::state& state, nvbench::enum_type>) { auto const num_probes = static_cast(state.get_int64("num_probes")); - auto const left_size = state.get_int64("left_size"); - auto const right_size = state.get_int64("right_size"); auto const selectivity = state.get_float64("selectivity"); auto const join_type = state.get_string("join_type"); - if (join_type == "mark_join" && left_size > right_size) { - state.skip("mark_join: build (left) should be smaller than probe 
(right)"); - return; - } - if (join_type == "filtered_join" && right_size > left_size) { - state.skip("filtered_join: build (right) should be smaller than probe (left)"); - return; - } + // filtered_join builds on the right side; mark_join builds on the left side. + auto const build_is_right = (join_type == "filtered_join"); auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); auto join = [num_probes, &join_type](cudf::table_view const& left, @@ -95,9 +78,8 @@ void nvbench_left_semi_join(nvbench::state& state, return obj.semi_join(left); } }; - auto const skip_large_right = (join_type == "filtered_join"); BM_join( - state, dtypes, join, 1, selectivity, skip_large_right); + state, dtypes, join, 1, selectivity, build_is_right); } template @@ -151,12 +133,14 @@ void nvbench_mark_left_semi_join_selectivity( { auto const num_probes = static_cast(state.get_int64("num_probes")); auto const selectivity = state.get_float64("selectivity"); + auto const prefilter = + state.get_int64("use_prefilter") != 0 ? 
cudf::join_prefilter::YES : cudf::join_prefilter::NO; auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); - auto join = [num_probes](cudf::table_view const& left, - cudf::table_view const& right, - cudf::null_equality compare_nulls) { - cudf::mark_join obj(left, compare_nulls, cudf::join_prefilter::YES, cudf::get_default_stream()); + auto join = [num_probes, prefilter](cudf::table_view const& left, + cudf::table_view const& right, + cudf::null_equality compare_nulls) { + cudf::mark_join obj(left, compare_nulls, prefilter, cudf::get_default_stream()); for (auto i = 0; i < num_probes - 1; i++) { [[maybe_unused]] auto result = obj.semi_join(right); } @@ -176,7 +160,8 @@ NVBENCH_BENCH_TYPES(nvbench_left_anti_join, .add_int64_axis("right_size", JOIN_SIZE_RANGE) .add_int64_axis("num_probes", {4}) .add_float64_axis("selectivity", {0.3}) - .add_string_axis("join_type", {"mark_join", "filtered_join"}); + .add_string_axis("join_type", {"mark_join", "filtered_join"}) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_left_semi_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -188,7 +173,8 @@ NVBENCH_BENCH_TYPES(nvbench_left_semi_join, .add_int64_axis("right_size", JOIN_SIZE_RANGE) .add_int64_axis("num_probes", {4}) .add_float64_axis("selectivity", {0.3}) - .add_string_axis("join_type", {"mark_join", "filtered_join"}); + .add_string_axis("join_type", {"mark_join", "filtered_join"}) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_filtered_left_anti_join_selectivity, NVBENCH_TYPE_AXES(DEFAULT_JOIN_NULL_EQUALITY, SELECTIVITY_JOIN_DATATYPES)) @@ -197,7 +183,8 @@ NVBENCH_BENCH_TYPES(nvbench_filtered_left_anti_join_selectivity, .add_int64_axis("left_size", {100'000'000}) .add_int64_axis("right_size", {100'000}) .add_int64_axis("num_probes", {4}) - .add_float64_axis("selectivity", JOIN_SELECTIVITY_RANGE); + .add_float64_axis("selectivity", JOIN_SELECTIVITY_RANGE) + .add_int64_axis("skip_large_sizes", {1}); 
NVBENCH_BENCH_TYPES(nvbench_filtered_left_semi_join_selectivity, NVBENCH_TYPE_AXES(DEFAULT_JOIN_NULL_EQUALITY, SELECTIVITY_JOIN_DATATYPES)) @@ -206,7 +193,8 @@ NVBENCH_BENCH_TYPES(nvbench_filtered_left_semi_join_selectivity, .add_int64_axis("left_size", {100'000'000}) .add_int64_axis("right_size", {100'000}) .add_int64_axis("num_probes", {4}) - .add_float64_axis("selectivity", JOIN_SELECTIVITY_RANGE); + .add_float64_axis("selectivity", JOIN_SELECTIVITY_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_mark_left_semi_join_selectivity, NVBENCH_TYPE_AXES(DEFAULT_JOIN_NULL_EQUALITY, SELECTIVITY_JOIN_DATATYPES)) @@ -215,4 +203,6 @@ NVBENCH_BENCH_TYPES(nvbench_mark_left_semi_join_selectivity, .add_int64_axis("left_size", {100'000}) .add_int64_axis("right_size", {100'000'000}) .add_int64_axis("num_probes", {4}) - .add_float64_axis("selectivity", JOIN_SELECTIVITY_RANGE); + .add_float64_axis("selectivity", JOIN_SELECTIVITY_RANGE) + .add_int64_axis("use_prefilter", {1}) + .add_int64_axis("skip_large_sizes", {1}); diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu index 1dcc9544101..f05ed2f3b39 100644 --- a/cpp/benchmarks/join/mixed_join.cu +++ b/cpp/benchmarks/join/mixed_join.cu @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -166,7 +166,8 @@ NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join, .set_name("mixed_inner_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join_complex_ast, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -176,7 +177,8 @@ NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join_complex_ast, .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) .add_int64_axis("right_size", JOIN_SIZE_RANGE) - .add_int64_axis("ast_levels", {1, 5, 10}); + .add_int64_axis("ast_levels", {1, 5, 10}) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_mixed_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -185,7 +187,8 @@ NVBENCH_BENCH_TYPES(nvbench_mixed_left_join, .set_name("mixed_left_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_mixed_full_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -194,7 +197,8 @@ NVBENCH_BENCH_TYPES(nvbench_mixed_full_join, .set_name("mixed_full_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -203,7 +207,8 @@ NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join, .set_name("mixed_left_semi_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", 
JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -212,4 +217,5 @@ NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join, .set_name("mixed_left_anti_join") .set_type_axes_names({"Nullable", "NullEquality", "DataType"}) .add_int64_axis("left_size", JOIN_SIZE_RANGE) - .add_int64_axis("right_size", JOIN_SIZE_RANGE); + .add_int64_axis("right_size", JOIN_SIZE_RANGE) + .add_int64_axis("skip_large_sizes", {1}); diff --git a/cpp/benchmarks/join/multiplicity_join.cu b/cpp/benchmarks/join/multiplicity_join.cu index a1aff0e1ef3..e9d8878db06 100644 --- a/cpp/benchmarks/join/multiplicity_join.cu +++ b/cpp/benchmarks/join/multiplicity_join.cu @@ -115,7 +115,8 @@ NVBENCH_BENCH_TYPES(nvbench_hm_inner_join, .add_int64_axis("num_keys", nvbench::range(1, 3, 1)) .add_int64_axis("left_size", JOIN_SIZE_RANGE) .add_int64_axis("right_size", JOIN_SIZE_RANGE) - .add_int64_axis("multiplicity", {100, 1'000, 10'000, 50'000}); + .add_int64_axis("multiplicity", {100, 1'000, 10'000, 50'000}) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_hm_left_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -127,7 +128,8 @@ NVBENCH_BENCH_TYPES(nvbench_hm_left_join, .add_int64_axis("num_keys", nvbench::range(1, 3, 1)) .add_int64_axis("left_size", JOIN_SIZE_RANGE) .add_int64_axis("right_size", JOIN_SIZE_RANGE) - .add_int64_axis("multiplicity", {100, 1'000, 10'000, 50'000}); + .add_int64_axis("multiplicity", {100, 1'000, 10'000, 50'000}) + .add_int64_axis("skip_large_sizes", {1}); NVBENCH_BENCH_TYPES(nvbench_hm_full_join, NVBENCH_TYPE_AXES(JOIN_NULLABLE_RANGE, @@ -139,4 +141,5 @@ NVBENCH_BENCH_TYPES(nvbench_hm_full_join, .add_int64_axis("num_keys", nvbench::range(1, 3, 1)) .add_int64_axis("left_size", JOIN_SIZE_RANGE) .add_int64_axis("right_size", JOIN_SIZE_RANGE) - 
.add_int64_axis("multiplicity", {100, 1'000, 10'000, 50'000}); + .add_int64_axis("multiplicity", {100, 1'000, 10'000, 50'000}) + .add_int64_axis("skip_large_sizes", {1}); diff --git a/cpp/benchmarks/join/sort_merge_join.cu b/cpp/benchmarks/join/sort_merge_join.cu index 0dcdc21791d..461960e1c2a 100644 --- a/cpp/benchmarks/join/sort_merge_join.cu +++ b/cpp/benchmarks/join/sort_merge_join.cu @@ -40,10 +40,7 @@ void nvbench_sort_merge_inner_join(nvbench::state& state, auto const multiplicity = 1; auto const selectivity = 0.3; - if (right_size > left_size) { - state.skip("Skip large right table"); - return; - } + if (should_skip_large_sizes(state)) { return; } auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); @@ -116,4 +113,5 @@ NVBENCH_BENCH_TYPES(nvbench_sort_merge_inner_join, .add_int64_axis("num_keys", nvbench::range(1, 3, 1)) .add_int64_axis("left_size", {10'000, 100'000}) .add_int64_axis("right_size", {10'000, 100'000}) - .add_int64_axis("use_key_remap", {0, 1}); + .add_int64_axis("use_key_remap", {0, 1}) + .add_int64_axis("skip_large_sizes", {1}); From c2f583c7a4ec2503ac41ebbc75a98cb3e89c58b9 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 May 2026 05:50:21 -0500 Subject: [PATCH 09/36] Expose __from_arrow__ on masked extension dtype proxies (#22373) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Pandas' `BaseMaskedDtype` defines `__from_arrow__` for converting a `pyarrow.Array` (including `NullArray`/`ChunkedArray` of nulls) into the matching `BaseMaskedArray`. The cudf.pandas final proxy types for `BooleanDtype`, `Int{8,16,32,64}Dtype`, `UInt{8,16,32,64}Dtype`, and `Float{32,64}Dtype` did not list `__from_arrow__` in their `additional_attributes`, so the proxy `__getattr__` raised `AttributeError` even though the slow object has it. 
## Change Add `"__from_arrow__": _FastSlowAttribute("__from_arrow__")` to all eleven masked dtype proxy declarations in `python/cudf/cudf/pandas/_wrappers/pandas.py`, mirroring the existing pattern on `ArrowDtype`. ## Tests / Conftest Removes 25 entries from `conftest-patch.py` that were xfailed only because of the missing attribute: - 22 parametrizations of `tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null` (all eleven masked dtypes × two arrow array shapes). - `tests/arrays/masked/test_arrow_compat.py::test_arrow_from_arrow_uint`. - `tests/arrays/masked/test_arrow_compat.py::test_dataframe_from_arrow_types_mapper`. - `tests/indexes/multi/test_constructors.py::test_from_frame_missing_values_multiIndex`. All 22 `test_from_arrow_null` cases pass, the full `test_arrow_compat.py` file passes (69 passed, 22 unrelated xfails), and the cudf-side `cudf_pandas_tests/` suite is clean (435 passed). Co-authored-by: Claude Opus 4.7 (1M context) --- python/cudf/cudf/pandas/_wrappers/pandas.py | 11 ++++++++ .../cudf/pandas/scripts/conftest-patch.py | 25 ------------------- 2 files changed, 11 insertions(+), 25 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 006473953cd..0bdc05205f5 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -882,6 +882,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -908,6 +909,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -921,6 +923,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(),
bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -933,6 +936,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -945,6 +949,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -957,6 +962,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -969,6 +975,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -981,6 +988,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -993,6 +1001,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) @@ -1074,6 +1083,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), 
}, ) @@ -1086,6 +1096,7 @@ def Index__setattr__(self, name, value): slow_to_fast=_Unusable(), bases=(pd.api.extensions.ExtensionDtype,), additional_attributes={ + "__from_arrow__": _FastSlowAttribute("__from_arrow__"), "__hash__": _FastSlowAttribute("__hash__"), }, ) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 8e65569d557..83202033fa0 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -537,30 +537,6 @@ def pytest_unconfigure(config): "tests/arrays/masked/test_arrow_compat.py::test_arrow_array[UInt64]": "TODO: Add a reason for failure", "tests/arrays/masked/test_arrow_compat.py::test_arrow_array[UInt8]": "TODO: Add a reason for failure", "tests/arrays/masked/test_arrow_compat.py::test_arrow_array[boolean]": "TODO: Add a reason for failure", - "tests/arrays/masked/test_arrow_compat.py::test_arrow_from_arrow_uint": "AttributeError: 'UInt32Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_dataframe_from_arrow_types_mapper": "ValueError: This column does not support to be converted to a pandas ExtensionArray", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Float32-arr0]": "AttributeError: 'Float32Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Float32-arr1]": "AttributeError: 'Float32Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Float64-arr0]": "AttributeError: 'Float64Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Float64-arr1]": "AttributeError: 'Float64Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int16-arr0]": "AttributeError: 'Int16Dtype' object has no attribute '__from_arrow__'", 
- "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int16-arr1]": "AttributeError: 'Int16Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int32-arr0]": "AttributeError: 'Int32Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int32-arr1]": "AttributeError: 'Int32Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int64-arr0]": "AttributeError: 'Int64Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int64-arr1]": "AttributeError: 'Int64Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int8-arr0]": "AttributeError: 'Int8Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[Int8-arr1]": "AttributeError: 'Int8Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt16-arr0]": "AttributeError: 'UInt16Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt16-arr1]": "AttributeError: 'UInt16Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt32-arr0]": "AttributeError: 'UInt32Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt32-arr1]": "AttributeError: 'UInt32Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt64-arr0]": "AttributeError: 'UInt64Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt64-arr1]": "AttributeError: 'UInt64Dtype' object has no attribute '__from_arrow__'", - 
"tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt8-arr0]": "AttributeError: 'UInt8Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[UInt8-arr1]": "AttributeError: 'UInt8Dtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[boolean-arr0]": "AttributeError: 'BooleanDtype' object has no attribute '__from_arrow__'", - "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_null[boolean-arr1]": "AttributeError: 'BooleanDtype' object has no attribute '__from_arrow__'", "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_type_error[Float32]": "TODO: Add a reason for failure", "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_type_error[Float64]": "TODO: Add a reason for failure", "tests/arrays/masked/test_arrow_compat.py::test_from_arrow_type_error[Int16]": "TODO: Add a reason for failure", @@ -4310,7 +4286,6 @@ def pytest_unconfigure(config): "tests/indexes/multi/test_constructors.py::test_create_index_existing_name": "TODO: Add a reason for failure", "tests/indexes/multi/test_constructors.py::test_from_arrays_respects_none_names": "TODO: Add a reason for failure", "tests/indexes/multi/test_constructors.py::test_from_frame_dtype_fidelity": "TODO: Add a reason for failure", - "tests/indexes/multi/test_constructors.py::test_from_frame_missing_values_multiIndex": "AttributeError: 'Float64Dtype' object has no attribute '__from_arrow__'", "tests/indexes/multi/test_constructors.py::test_multiindex_inference_consistency": "TODO: Add a reason for failure", "tests/indexes/multi/test_conversion.py::test_to_frame_column_rangeindex": "AssertionError: Index are different", "tests/indexes/multi/test_conversion.py::test_to_frame_dtype_fidelity": "TODO: Add a reason for failure", From 5c4c21abd4a5786d9d4b1aeab15b0fc24841b7ea Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 May 2026 05:53:49 -0500 Subject: [PATCH 
10/36] Fix datetime resolution for empty data inputs (#22363) ## Description In pandas empty datetime inputs default to `s` resolution, this PR fixes that inconsistency and matches `cudf` with `pandas`. This PR also fixes `freq` preservation in `Groupby.size` ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. --- python/cudf/cudf/core/index.py | 29 ++- python/cudf/cudf/core/resample.py | 44 ++--- python/cudf/cudf/core/tools/datetimes.py | 55 +++--- .../cudf/pandas/scripts/conftest-patch.py | 170 ------------------ .../general_functions/test_to_datetime.py | 99 ++++++++++ .../cudf/cudf/tests/groupby/test_resample.py | 75 ++++++-- .../datetimeindex/methods/test_to_pandas.py | 27 +++ .../indexes/datetimeindex/test_constructor.py | 43 ++++- 8 files changed, 307 insertions(+), 235 deletions(-) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 41be251acce..54ffdf33352 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -3206,7 +3206,15 @@ def __init__( data = data.astype(dtype) elif data.dtype.kind != "M": if is_dtype_obj_string(data.dtype): - data = data.astype(np.dtype("datetime64[us]")) + # Pandas's array_to_datetime falls back to [s] when no + # concrete (non-NaT) datetime is observed — empty input or + # an all-NaT/None array (pandas-dev/pandas#55901). Otherwise + # parsed strings land on [us]. 
+ if len(data) == 0 or data.null_count == len(data): + target_unit = "s" + else: + target_unit = "us" + data = data.astype(np.dtype(f"datetime64[{target_unit}]")) else: data = data.astype(np.dtype("datetime64[ns]")) @@ -4030,11 +4038,20 @@ def to_pandas( ) -> pd.DatetimeIndex: result = super().to_pandas(nullable=nullable, arrow_type=arrow_type) if not arrow_type and self._freq is not None: - # Re-infer from the result's values rather than trusting the cached - # self._freq, which (e.g. via deserialization or external assignment) - # may not conform. Pandas validates on assignment and raises when - # values don't match, so inferring keeps the proxy round-trip robust. - result.freq = result.inferred_freq + # Prefer pandas's inferred_freq because the cached self._freq may + # not conform (e.g. after deserialization or external assignment) + # and pandas validates the assignment against the index values. + # Fall back to the cached freq when inference is impossible + # (empty / single-element indexes), so resample round-trips + # preserve `freq` to match pandas. + inferred = result.inferred_freq + if inferred is None: + try: + result.freq = self._freq._maybe_as_fast_pandas_offset() + except ValueError: + pass + else: + result.freq = inferred return result @_performance_tracking diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py index 3008ba19d08..9bef8000e2b 100644 --- a/python/cudf/cudf/core/resample.py +++ b/python/cudf/cudf/core/resample.py @@ -4,7 +4,6 @@ from typing import TYPE_CHECKING -import numpy as np import pandas as pd import cudf @@ -70,6 +69,25 @@ def asfreq(self): ) ) + def size(self): + # GroupBy.size bypasses _Resampler.agg and so doesn't pick up the + # bin-label freq. Re-align to the full set of bins (filling empty + # buckets with 0, since size is non-null in pandas) and re-attach + # the freq. 
+ result = super().size() + if len(self.grouping.bin_labels) != len(result): + index = cudf.Index( + self.grouping.bin_labels, name=self.grouping.names[0] + ) + result = ( + result._align_to_index( + index, how="right", sort=False, allow_non_unique=True + ) + .fillna(0) + .astype(result.dtype) + ) + return self._restore_freq(result.sort_index()) + def _scan_fill( self, method: plc.replace.ReplacePolicy, limit: int | None ) -> DataFrameOrSeries: @@ -260,26 +278,12 @@ def _handle_frequency_grouper(self, by): freq=freq, ) - # We want the (resampled) column of timestamps in the result - # to have a resolution closest to the resampling - # frequency. For example, if resampling from '1T' to '1s', we - # want the resulting timestamp column to by of dtype - # 'datetime64[s]'. libcudf requires the bin labels and key - # column to have the same dtype, so we compute a `result_type` + # Pandas resample preserves the input column's resolution, so the + # resulting timestamp column should match `key_column.dtype` rather + # than be derived from the offset. libcudf requires the bin labels + # and key column to share a dtype, so we compute a `result_type` # and cast them both to that type. - if offset.rule_code.lower() in {"d", "h"}: - # unsupported resolution (we don't support resolutions >s) - result_type = np.dtype("datetime64[s]") - else: - try: - result_type = np.dtype(f"datetime64[{offset.rule_code}]") - # TODO: Ideally, we can avoid one cast by having `date_range` - # generate timestamps of a given dtype. 
Currently, it can - # only generate timestamps with 'ns' precision - except TypeError: - # unsupported resolution (we don't support resolutions >s) - # fall back to using datetime64[s] - result_type = np.dtype("datetime64[s]") + result_type = key_column.dtype cast_key_column = key_column.astype(result_type) cast_bin_labels = bin_labels.astype(result_type) diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py index e56c29e68d4..c83610287df 100644 --- a/python/cudf/cudf/core/tools/datetimes.py +++ b/python/cudf/cudf/core/tools/datetimes.py @@ -133,7 +133,7 @@ def to_datetime( >>> cudf.to_datetime(df) 0 2015-02-04 1 2016-03-05 - dtype: datetime64[s] + dtype: datetime64[us] >>> cudf.to_datetime(1490195805, unit='s') Timestamp('2017-03-22 15:16:45') >>> cudf.to_datetime(1490195805433502912, unit='ns') @@ -216,32 +216,31 @@ def to_datetime( .str.zfill(2) ) format = "%Y-%m-%d" + target_unit = "us" for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) if value is not None and value in arg: arg_col = arg._data[value] if arg_col.dtype.kind == "f": - col = new_series._column.strptime( - np.dtype("datetime64[ns]"), format=format - ) + target_unit = "ns" break elif arg_col.dtype.kind == "O": string_col = cast("StringColumn", arg_col) if not string_col.is_all_integer(): - col = new_series._column.strptime( - np.dtype("datetime64[ns]"), format=format - ) + target_unit = "ns" break - else: - col = new_series._column.strptime( - np.dtype("datetime64[s]"), format=format - ) + elif u == "ns": + # An explicit nanosecond field forces ns precision + # (pandas widens to [ns] when ns is present). 
+ target_unit = "ns" + col = new_series._column.strptime( + np.dtype(f"datetime64[{target_unit}]"), format=format + ) times_column = None - factor_denominator = ( - unit_to_nanoseconds_conversion["s"] - if np.datetime_data(col.dtype)[0] == "s" - else 1 + col_unit = np.datetime_data(col.dtype)[0] + factor_denominator = unit_to_nanoseconds_conversion.get( + col_unit, 1 ) for u in ["h", "m", "s", "ms", "us", "ns"]: value = unit_rev.get(u) @@ -259,9 +258,12 @@ def to_datetime( np.dtype(np.float64) ) - factor = ( - unit_to_nanoseconds_conversion[u] / factor_denominator - ) + factor_numerator = unit_to_nanoseconds_conversion[u] + factor: int | float + if factor_numerator % factor_denominator == 0: + factor = factor_numerator // factor_denominator + else: + factor = factor_numerator / factor_denominator if times_column is None: times_column = current_col * factor @@ -280,9 +282,6 @@ def to_datetime( ) return Series._from_column(col, index=arg.index) else: - if unit is None and is_scalar(arg): - unit = "ns" - col = _process_col( col=as_column(arg), unit=unit, @@ -333,13 +332,13 @@ def _process_col( # int column out of it to parse against `format`. # Instead we directly cast to int and perform # parsing against `format`. + # Pandas 3 defaults parsed datetimes to `datetime64[us]` + # regardless of format precision. col = ( col.astype(np.dtype(np.int64)) .astype(DEFAULT_STRING_DTYPE) .strptime( - dtype=np.dtype("datetime64[us]") - if "%f" in format - else np.dtype("datetime64[s]"), + dtype=np.dtype("datetime64[us]"), format=format, ) ) @@ -373,7 +372,13 @@ def _process_col( ) elif col.dtype.kind == "O": - if unit not in (None, "ns") or col.is_all_null: + if col.is_all_null: + # Pandas converts all-null inputs to NaT at second precision + # regardless of `unit`/`format`; mirror that here without + # routing through the int/float path (which would land on + # the [ns]/[us] defaults). 
+ return col.astype(np.dtype("datetime64[s]")) + if unit not in (None, "ns"): try: col = col.astype(np.dtype(np.int64)) except ValueError: diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 83202033fa0..373e0bf1670 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -212,50 +212,34 @@ def pytest_unconfigure(config): "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_dti_cmp_str['US/Eastern']": "AssertionError: numpy array are different", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'Asia/Tokyo'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'Asia/Tokyo'-DataFrame-other2]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'Asia/Tokyo'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'Asia/Tokyo'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'Asia/Tokyo'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'Asia/Tokyo'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'US/Eastern'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'US/Eastern'-DataFrame-other2]": "Failed: DID NOT RAISE ", - 
"tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'US/Eastern'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'US/Eastern'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'US/Eastern'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[ge-'US/Eastern'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'Asia/Tokyo'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'Asia/Tokyo'-DataFrame-other2]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'Asia/Tokyo'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'Asia/Tokyo'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'Asia/Tokyo'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'Asia/Tokyo'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'US/Eastern'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'US/Eastern'-DataFrame-other2]": "Failed: DID NOT RAISE ", - 
"tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'US/Eastern'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'US/Eastern'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'US/Eastern'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[gt-'US/Eastern'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'Asia/Tokyo'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'Asia/Tokyo'-DataFrame-other2]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'Asia/Tokyo'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'Asia/Tokyo'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'Asia/Tokyo'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'Asia/Tokyo'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'US/Eastern'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'US/Eastern'-DataFrame-other2]": "Failed: DID NOT RAISE ", - 
"tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'US/Eastern'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'US/Eastern'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'US/Eastern'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[le-'US/Eastern'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'Asia/Tokyo'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'Asia/Tokyo'-DataFrame-other2]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'Asia/Tokyo'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'Asia/Tokyo'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'Asia/Tokyo'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'Asia/Tokyo'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'US/Eastern'-DataFrame-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'US/Eastern'-DataFrame-other2]": "Failed: DID NOT RAISE ", - 
"tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'US/Eastern'-Index-other0]": "Failed: DID NOT RAISE ", - "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'US/Eastern'-Index-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'US/Eastern'-Series-other0]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestDatetimeIndexComparisons::test_scalar_comparison_tzawareness[lt-'US/Eastern'-Series-other2]": "Failed: DID NOT RAISE ", "tests/arithmetic/test_datetime64.py::TestTimestampSeriesArithmetic::test_operators_datetimelike_with_timezones": "AssertionError: Attributes of Series are different", @@ -1128,7 +1112,6 @@ def pytest_unconfigure(config): "tests/copy_view/test_methods.py::test_transpose": "AssertionError: assert False", "tests/copy_view/test_methods.py::test_truncate[kwargs0]": "TODO: Add a reason for failure", "tests/copy_view/test_methods.py::test_truncate[kwargs1]": "TODO: Add a reason for failure", - "tests/copy_view/test_methods.py::test_tz_convert_localize[tz_convert-Europe/Berlin]": "assert False", "tests/copy_view/test_methods.py::test_tz_convert_localize[tz_localize-None]": "assert False", "tests/copy_view/test_methods.py::test_update_chained_assignment": "TODO: Add a reason for failure", "tests/copy_view/test_methods.py::test_update_frame": "TODO: Add a reason for failure", @@ -2475,7 +2458,6 @@ def pytest_unconfigure(config): "tests/frame/methods/test_to_numpy.py::TestToNumpy::test_to_numpy_copy": "TODO: Add a reason for failure", "tests/frame/methods/test_to_numpy.py::TestToNumpy::test_to_numpy_mixed_dtype_to_str": "TODO: Add a reason for failure", "tests/frame/methods/test_to_records.py::TestDataFrameToRecords::test_to_records_dt64tz_column": "TODO: Add a reason for failure", - 
"tests/frame/methods/test_to_timestamp.py::TestToTimestamp::test_to_timestamp_columns": "AssertionError: Index are different", "tests/frame/methods/test_transpose.py::TestTranspose::test_transpose_get_view_dt64tzget_view": "assert 3 == 1", "tests/frame/methods/test_truncate.py::TestDataFrameTruncate::test_truncate_multiindex[DataFrame]": "TODO: Add a reason for failure", "tests/frame/methods/test_value_counts.py::test_data_frame_value_counts_dropna_false[NAType]": "AssertionError: Series.index level [0] are different", @@ -3048,7 +3030,6 @@ def pytest_unconfigure(config): "tests/frame/test_ufunc.py::test_binary_input_aligns_index[int64]": "TODO: Add a reason for failure", "tests/frame/test_unary.py::TestDataFrameUnaryOperators::test_pos_object_raises": "Failed: DID NOT RAISE ", "tests/frame/test_unary.py::TestDataFrameUnaryOperators::test_pos_raises": "Failed: DID NOT RAISE ", - "tests/generic/test_finalize.py::test_finalize_called[to_period]": "ValueError: You must pass a freq argument as current index has none.", "tests/generic/test_frame.py::TestDataFrame::test_metadata_propagation_indiv": "TODO: Add a reason for failure", "tests/generic/test_generic.py::TestGeneric::test_stdlib_copy_shallow_copies[DataFrame]": "assert False", "tests/generic/test_generic.py::TestGeneric::test_stdlib_copy_shallow_copies[Series]": "assert False", @@ -3446,8 +3427,6 @@ def pytest_unconfigure(config): "tests/groupby/test_counting.py::test_count": "AssertionError: DataFrame are different", "tests/groupby/test_counting.py::test_count_arrow_string_array[string=string[pyarrow]]": "TODO: Add a reason for failure", "tests/groupby/test_counting.py::test_count_arrow_string_array[string=string[python]]": "TODO: Add a reason for failure", - "tests/groupby/test_cumulative.py::test_cummax_datetime": "AssertionError: Attributes of Series are different", - "tests/groupby/test_cumulative.py::test_cummin_datetime": "AssertionError: Attributes of Series are different", 
"tests/groupby/test_cumulative.py::test_cython_api2[False]": "AssertionError: DataFrame are different", "tests/groupby/test_cumulative.py::test_groupby_cumprod_nan_influences_other_columns": "TODO: Add a reason for failure", "tests/groupby/test_cumulative.py::test_numpy_compat[cumprod]": "TODO: Add a reason for failure", @@ -3456,7 +3435,6 @@ def pytest_unconfigure(config): "tests/groupby/test_groupby.py::test_dont_clobber_name_column": "AssertionError: DataFrame are different", "tests/groupby/test_groupby.py::test_group_name_available_in_inference_pass": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_group_on_two_row_multiindex_returns_one_tuple_key": "TODO: Add a reason for failure", - "tests/groupby/test_groupby.py::test_groupby_agg_ohlc_non_first": "AssertionError: (, None)", "tests/groupby/test_groupby.py::test_groupby_all_nan_groups_drop": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_column_index_name_lost[shift]": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_column_index_name_lost[sum]": "TODO: Add a reason for failure", @@ -4585,7 +4563,6 @@ def pytest_unconfigure(config): "tests/indexes/test_datetimelike.py::TestDatetimeLike::test_argsort_matches_array[simple_index1]": "TODO: Add a reason for failure", "tests/indexes/test_datetimelike.py::TestDatetimeLike::test_argsort_matches_array[simple_index2]": "TODO: Add a reason for failure", "tests/indexes/test_index_new.py::TestIndexConstructorInference::test_constructor_datetimes_mixed_tzs": "AssertionError: Index are different", - "tests/indexes/test_index_new.py::TestIndexConstructorInference::test_infer_nat[nan1]": "AssertionError: assert (dtype(', None)", - "tests/resample/test_base.py::test_resample_empty_sum_string[string=str[pyarrow]-1]": "AssertionError: (<20 * Seconds>, None)", - "tests/resample/test_base.py::test_resample_empty_sum_string[string=str[python]-0]": "AssertionError: (<20 * Seconds>, None)", - 
"tests/resample/test_base.py::test_resample_empty_sum_string[string=str[python]-1]": "AssertionError: (<20 * Seconds>, None)", - "tests/resample/test_base.py::test_resample_empty_sum_string[string=string[pyarrow]-0]": "AssertionError: (<20 * Seconds>, None)", - "tests/resample/test_base.py::test_resample_empty_sum_string[string=string[pyarrow]-1]": "AssertionError: (<20 * Seconds>, None)", - "tests/resample/test_base.py::test_resample_empty_sum_string[string=string[python]-0]": "AssertionError: (<20 * Seconds>, None)", - "tests/resample/test_base.py::test_resample_empty_sum_string[string=string[python]-1]": "AssertionError: (<20 * Seconds>, None)", - "tests/resample/test_base.py::test_resample_size_empty_dataframe[D-index0]": "AssertionError: Series.index are different", - "tests/resample/test_base.py::test_resample_size_empty_dataframe[h-index0]": "AssertionError: Series.index are different", "tests/resample/test_datetime_index.py::test_resample_anchored_intraday2[ms]": "TODO: Add a reason for failure", "tests/resample/test_datetime_index.py::test_resample_anchored_intraday2[ns]": "TODO: Add a reason for failure", "tests/resample/test_datetime_index.py::test_resample_anchored_intraday2[s]": "TODO: Add a reason for failure", @@ -5874,37 +5795,15 @@ def pytest_unconfigure(config): "tests/resample/test_datetime_index.py::test_resample_origin_epoch_with_tz_day_vs_24h[ns]": "TODO: Add a reason for failure", "tests/resample/test_datetime_index.py::test_resample_origin_epoch_with_tz_day_vs_24h[s]": "TODO: Add a reason for failure", "tests/resample/test_datetime_index.py::test_resample_origin_epoch_with_tz_day_vs_24h[us]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_rounding[ms]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_rounding[ns]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_rounding[us]": "TODO: Add a reason for failure", - 
"tests/resample/test_datetime_index.py::test_resample_size[ms]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_size[ns]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_size[s]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_size[us]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_upsampling_picked_but_not_correct[ms]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_upsampling_picked_but_not_correct[ns]": "TODO: Add a reason for failure", - "tests/resample/test_datetime_index.py::test_resample_upsampling_picked_but_not_correct[us]": "TODO: Add a reason for failure", - "tests/resample/test_period_index.py::TestPeriodIndex::test_evenly_divisible_with_no_extra_bins": "AssertionError: DataFrame.index are different", "tests/resample/test_period_index.py::TestPeriodIndex::test_evenly_divisible_with_no_extra_bins2": "TODO: Add a reason for failure", - "tests/resample/test_period_index.py::TestPeriodIndex::test_resample_tz_localized2": "AssertionError: (, None)", "tests/resample/test_period_index.py::TestPeriodIndex::test_resample_tz_localized[ms]": "TODO: Add a reason for failure", "tests/resample/test_period_index.py::TestPeriodIndex::test_resample_tz_localized[ns]": "TODO: Add a reason for failure", "tests/resample/test_period_index.py::TestPeriodIndex::test_resample_tz_localized[s]": "TODO: Add a reason for failure", "tests/resample/test_period_index.py::TestPeriodIndex::test_resample_tz_localized[us]": "TODO: Add a reason for failure", "tests/resample/test_period_index.py::TestPeriodIndex::test_with_local_timezone[tz0]": "TODO: Add a reason for failure", "tests/resample/test_period_index.py::TestPeriodIndex::test_with_local_timezone[tz1]": "TODO: Add a reason for failure", - 
"tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_col_resample-agg0]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_col_resample-agg1]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_col_resample-agg2]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_grouper_resample-agg0]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_grouper_resample-agg1]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_grouper_resample-agg2]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_mult_resample-agg0]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_mult_resample-agg1]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_mult_resample-agg2]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_resample-agg0]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_resample-agg1]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_both_mean_std_named_result[df_resample-agg2]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_both_mean_sum[df_col_resample-agg1]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_both_mean_sum[df_grouper_resample-agg1]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_both_mean_sum[df_mult_resample-agg1]": "TODO: Add a reason for failure", @@ -5916,40 +5815,15 @@ def 
pytest_unconfigure(config): "tests/resample/test_resample_api.py::test_agg_specificationerror_series[df_grouper_resample-agg0]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_specificationerror_series[df_mult_resample-agg0]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_specificationerror_series[df_resample-agg0]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_col_resample-agg0]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_col_resample-agg1]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_col_resample-agg2]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_grouper_resample-agg0]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_with_lambda[df_grouper_resample-agg1]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_agg_with_lambda[df_grouper_resample-agg2]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_mult_resample-agg0]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_mult_resample-agg1]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_mult_resample-agg2]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_resample-agg0]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_resample-agg1]": "TODO: Add a reason for failure", - "tests/resample/test_resample_api.py::test_agg_with_lambda[df_resample-agg2]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_groupby_resample_api": "TODO: Add a reason for failure", 
"tests/resample/test_resample_api.py::test_transform_frame[None]": "TODO: Add a reason for failure", "tests/resample/test_resample_api.py::test_transform_frame[date]": "TODO: Add a reason for failure", - "tests/resample/test_resampler_grouper.py::test_apply_columns_multilevel": "AssertionError: (, None)", "tests/resample/test_resampler_grouper.py::test_deferred_with_groupby": "TODO: Add a reason for failure", "tests/resample/test_resampler_grouper.py::test_getitem": "AssertionError: Series.index are different", - "tests/resample/test_time_grouper.py::test_aggregate_normal[count]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[first]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[last]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[max]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[mean]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[median]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[min]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[nunique]": "TODO: Add a reason for failure", "tests/resample/test_time_grouper.py::test_aggregate_normal[ohlc]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[prod]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[quantile]": "TODO: Add a reason for failure", "tests/resample/test_time_grouper.py::test_aggregate_normal[sem]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[size]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[std]": "TODO: Add a reason for failure", - 
"tests/resample/test_time_grouper.py::test_aggregate_normal[sum]": "TODO: Add a reason for failure", - "tests/resample/test_time_grouper.py::test_aggregate_normal[var]": "TODO: Add a reason for failure", "tests/resample/test_time_grouper.py::test_apply_iteration": "TODO: Add a reason for failure", "tests/resample/test_time_grouper.py::test_groupby_resample_interpolate_with_apply_syntax": "AssertionError: DataFrame.index level [1] are different", "tests/resample/test_time_grouper.py::test_groupby_resample_interpolate_with_apply_syntax_off_grid": "AssertionError: DataFrame.index level [1] are different", @@ -6236,7 +6110,6 @@ def pytest_unconfigure(config): "tests/reshape/test_cut.py::test_cut_pass_series_name_to_factor": "TODO: Add a reason for failure", "tests/reshape/test_cut.py::test_cut_unordered_labels[data0-bins0-labels0-expected_codes0-expected_labels0]": "TODO: Add a reason for failure", "tests/reshape/test_cut.py::test_cut_unordered_labels[data1-bins1-labels1-expected_codes1-expected_labels1]": "TODO: Add a reason for failure", - "tests/reshape/test_cut.py::test_datetime_bin[1]": "AssertionError: Series category.categories are different", "tests/reshape/test_cut.py::test_datetime_nan_error": "TODO: Add a reason for failure", "tests/reshape/test_cut.py::test_label_precision": "TODO: Add a reason for failure", "tests/reshape/test_cut.py::test_no_right": "TODO: Add a reason for failure", @@ -7286,7 +7159,6 @@ def pytest_unconfigure(config): "tests/strings/test_split_partition.py::test_split_nan_expand[string=object]": "AssertionError: DataFrame.iloc[:, 0] (column name='0') are different", "tests/strings/test_strings.py::test_index_str_accessor_multiindex_raises": "TODO: Add a reason for failure", "tests/strings/test_strings.py::test_split_join_roundtrip[string=string[pyarrow]]": "AssertionError: Series are different", - "tests/strings/test_strings.py::test_split_join_roundtrip[string=string[python]]": "AssertionError: Series are different", 
"tests/strings/test_strings.py::test_string_slice_out_of_bounds[string=object]": "AssertionError: Series are different", "tests/test_algos.py::TestFactorize::test_basic": "TODO: Add a reason for failure", "tests/test_algos.py::TestFactorize::test_datetime64_factorize[False]": "TODO: Add a reason for failure", @@ -7449,10 +7321,6 @@ def pytest_unconfigure(config): "tests/tools/test_to_datetime.py::TestOrigin::test_julian": "AssertionError: Attributes of Series are different", "tests/tools/test_to_datetime.py::TestOrigin::test_to_datetime_out_of_bounds_with_format_arg[%Y-%d-%m %H:%M:%S-None]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestOrigin::test_to_datetime_out_of_bounds_with_format_arg[%Y-%m-%d %H:%M:%S-None]": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestOrigin::test_to_datetime_out_of_bounds_with_format_arg[None-UserWarning]": "AssertionError: assert 1833 == 2417", - "tests/tools/test_to_datetime.py::TestTimeConversionFormats::test_to_datetime_format_YYYYMMDD_with_none[input_s4]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestTimeConversionFormats::test_to_datetime_format_YYYYMMDD_with_none[input_s5]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestTimeConversionFormats::test_to_datetime_format_YYYYMMDD_with_none[input_s6]": "AssertionError: Attributes of Series are different", "tests/tools/test_to_datetime.py::TestTimeConversionFormats::test_to_datetime_format_weeks[False-2013020-%Y%U%w-2013-01-13]": "AssertionError: assert Timestamp('2013-01-19 00:00:00') == Timestamp('2013-01-13 00:00:00')", "tests/tools/test_to_datetime.py::TestTimeConversionFormats::test_to_datetime_format_weeks[True-2013020-%Y%U%w-2013-01-13]": "AssertionError: assert Timestamp('2013-01-19 00:00:00') == Timestamp('2013-01-13 00:00:00')", "tests/tools/test_to_datetime.py::TestToDatetime::test_datetime_bool[False-False]": "TODO: Add a 
reason for failure", @@ -7464,7 +7332,6 @@ def pytest_unconfigure(config): "tests/tools/test_to_datetime.py::TestToDatetime::test_datetime_invalid_scalar[None-00:01:99]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_datetime_invalid_scalar[None-a]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_datetime_outofbounds_scalar[%H:%M:%S-3000/12/11 00:00:00]": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestToDatetime::test_datetime_outofbounds_scalar[None-3000/12/11 00:00:00]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_error_iso_week_year[coerce-Day of the year directive '%j' is not compatible with ISO year directive '%G'. Use '%Y' instead.-1999 50-%G %j]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_error_iso_week_year[coerce-Day of the year directive '%j' is not compatible with ISO year directive '%G'. Use '%Y' instead.-1999 51 6 256-%G %V %u %j]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_error_iso_week_year[coerce-ISO week directive '%V' is incompatible with the year directive '%Y'. 
Use the ISO year '%G' instead.-1999 50-%Y %V]": "TODO: Add a reason for failure", @@ -7487,10 +7354,6 @@ def pytest_unconfigure(config): "tests/tools/test_to_datetime.py::TestToDatetime::test_mixed_offsets_with_native_datetime_utc_false_raises": "assert False", "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_arrow[index-None-False]": "AssertionError: assert DatetimeIndex([1965-04-03 00:00:00, 1965-04-17 00:00:00, 1965-05-01 00:00:00,\n 1965-05-...", "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_arrow[index-US/Central-False]": "AssertionError: assert Index([1965-04-03 00:00:00-06:00, 1965-04-17 00:00:00-06:00,\n 1965-05-01 00:00:00-05:00...", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_converts_null_like_to_nat[False-input2]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_converts_null_like_to_nat[False-input3]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_converts_null_like_to_nat[True-input2]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_converts_null_like_to_nat[True-input3]": "AssertionError: Attributes of Series are different", "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_dt64d_out_of_bounds[False]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_dt64d_out_of_bounds[True]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_dt64s_out_of_ns_bounds[False-coerce-dt0]": "TODO: Add a reason for failure", @@ -7504,24 +7367,10 @@ def pytest_unconfigure(config): "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_dtarr[None]": "TODO: Add a reason for failure", 
"tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_dtarr[US/Central]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_overflow": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_with_format_out_of_bounds[00010101]": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_with_format_out_of_bounds[13000101]": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_with_format_out_of_bounds[30000101]": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestToDatetime::test_to_datetime_with_format_out_of_bounds[99990101]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_week_without_day_and_calendar_year[2017-20-%Y-%W]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetime::test_week_without_day_and_calendar_year[2017-21-%Y-%U]": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_field_aliases[False]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_field_aliases[True]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_field_aliases_column_subset[False-unit1]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_field_aliases_column_subset[True-unit1]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_int16[False]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_int16[True]": "AssertionError: Attributes of 
Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_mixed[False]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_mixed[True]": "AssertionError: Attributes of Series are different", "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_str_dtype[False]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetimeDataFrame::test_dataframe_str_dtype[True]": "TODO: Add a reason for failure", - "tests/tools/test_to_datetime.py::TestToDatetimeMisc::test_string_na_nat_conversion_with_name[False]": "AssertionError: Attributes of Series are different", - "tests/tools/test_to_datetime.py::TestToDatetimeMisc::test_string_na_nat_conversion_with_name[True]": "AssertionError: Attributes of Series are different", "tests/tools/test_to_datetime.py::TestToDatetimeMisc::test_to_datetime_barely_out_of_bounds": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetimeMisc::test_to_datetime_iso8601_exact_fails[2012-01-01 10-%Y-%m-%d]": "TODO: Add a reason for failure", "tests/tools/test_to_datetime.py::TestToDatetimeMisc::test_to_datetime_iso8601_exact_fails[2012-01-01 10:00-%Y-%m-%d %H]": "TODO: Add a reason for failure", @@ -7741,31 +7590,12 @@ def pytest_unconfigure(config): "tests/window/test_rolling.py::test_closed_fixed_binary_col[False-5]": "TODO: Add a reason for failure", "tests/window/test_rolling.py::test_closed_fixed_binary_col[True-10]": "TODO: Add a reason for failure", "tests/window/test_rolling.py::test_closed_fixed_binary_col[True-5]": "TODO: Add a reason for failure", - "tests/window/test_rolling.py::test_iter_rolling_datetime[expected0-expected_index0-1D]": "AssertionError: (, None)", - "tests/window/test_rolling.py::test_iter_rolling_datetime[expected1-expected_index1-2D]": "AssertionError: (, None)", - 
"tests/window/test_rolling.py::test_iter_rolling_datetime[expected2-expected_index2-3D]": "AssertionError: (, None)", "tests/window/test_rolling.py::test_rolling_non_monotonic[mean-expected1]": "TODO: Add a reason for failure", "tests/window/test_rolling.py::test_rolling_non_monotonic[sum-expected2]": "TODO: Add a reason for failure", "tests/window/test_rolling.py::test_rolling_non_monotonic[var-expected0]": "TODO: Add a reason for failure", "tests/window/test_rolling.py::test_rolling_var_same_value_count_logic[values0-3-1-expected0]": "TODO: Add a reason for failure", "tests/window/test_rolling.py::test_variable_window_nonunique[DataFrame-right-expected2]": "TODO: Add a reason for failure", "tests/window/test_rolling.py::test_variable_window_nonunique[Series-right-expected2]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_gh6297[10]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_gh6297[1]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_gh6297[2]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_gh6297[5]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_gh6297[None]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_resample[10]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_resample[1]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_resample[2]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_resample[5]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_max_resample[None]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_median_resample": "TODO: Add a reason for 
failure", - "tests/window/test_rolling_functions.py::test_rolling_min_resample[10]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_min_resample[1]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_min_resample[2]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_min_resample[5]": "TODO: Add a reason for failure", - "tests/window/test_rolling_functions.py::test_rolling_min_resample[None]": "TODO: Add a reason for failure", "tests/window/test_timeseries_window.py::TestRollingTS::test_invalid_minp[1.0]": "TODO: Add a reason for failure", "tests/window/test_timeseries_window.py::TestRollingTS::test_invalid_minp[foo]": "TODO: Add a reason for failure", "tests/window/test_timeseries_window.py::TestRollingTS::test_invalid_minp[minp2]": "TODO: Add a reason for failure", diff --git a/python/cudf/cudf/tests/general_functions/test_to_datetime.py b/python/cudf/cudf/tests/general_functions/test_to_datetime.py index 72a159f148c..dc47cf1ea86 100644 --- a/python/cudf/cudf/tests/general_functions/test_to_datetime.py +++ b/python/cudf/cudf/tests/general_functions/test_to_datetime.py @@ -308,3 +308,102 @@ def test_to_datetime_errors_non_scalar_not_implemented(errors): def test_to_datetime_errors_ignore_deprecated(): with pytest.warns(FutureWarning): cudf.to_datetime("2001-01-01 00:04:45", errors="ignore") + + +@pytest.mark.parametrize( + "data", + [ + # all-None object inputs land on [s] in pandas 3 — see + # pandas-dev/pandas#55901 (NPY_FR_GENERIC fallback to "s"). 
+ pd.Series([None, None, None]), + pd.Series([None] * 5), + pd.Series([], dtype="object"), + ], +) +def test_to_datetime_all_null_object_returns_seconds(data): + expected = pd.to_datetime(data) + actual = cudf.to_datetime(cudf.from_pandas(data)) + assert actual.dtype == expected.dtype + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "scalar", + ["1/1/2000", "2020-01-01", "2020-01-01 12:34:56"], +) +def test_to_datetime_scalar_string_returns_us(scalar): + # Scalar string parsing should land on [us] (pandas 3 default). + expected = pd.to_datetime(scalar) + actual = cudf.to_datetime(scalar) + assert actual.unit == expected.unit + assert actual == expected + + +@pytest.mark.parametrize( + "values", + [ + [19801222, 20010112, None], + [19801222, 20010112, np.nan], + [19801222, 20010112], + ], +) +def test_to_datetime_int_with_format_us(values): + # Float-with-format path (triggered when a None/nan widens int -> float) + # must land on [us] regardless of whether the format contains "%f". + expected = pd.to_datetime(values, format="%Y%m%d") + actual = cudf.to_datetime(values, format="%Y%m%d") + assert actual.dtype == expected.dtype + assert_eq(actual, expected, check_exact=False) + + +@pytest.mark.parametrize( + "df", + [ + pd.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]}), + pd.DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + } + ).astype("int16"), + pd.DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + "hour": [6, 7], + "minute": [58, 59], + "second": [10, 11], + } + ), + ], +) +def test_to_datetime_dataframe_default_us(df): + # DataFrame -> datetime defaults to [us] in pandas 3 (was [s] in cuDF). 
+ expected = pd.to_datetime(df) + actual = cudf.to_datetime(cudf.from_pandas(df)) + assert actual.dtype == expected.dtype + assert_eq(actual, expected) + + +def test_to_datetime_dataframe_with_ns_field_widens_to_ns(): + # When a ns field is explicitly present, the result must widen to [ns] + # (and the integer factor arithmetic must not lose the trailing ns). + df = pd.DataFrame( + { + "year": [2015, 2016], + "month": [2, 3], + "day": [4, 5], + "hour": [6, 7], + "minute": [58, 59], + "second": [10, 11], + "ms": [1, 1], + "us": [2, 2], + "ns": [3, 3], + } + ) + expected = pd.to_datetime(df) + actual = cudf.to_datetime(cudf.from_pandas(df)) + assert actual.dtype == expected.dtype + assert_eq(actual, expected) diff --git a/python/cudf/cudf/tests/groupby/test_resample.py b/python/cudf/cudf/tests/groupby/test_resample.py index bdd82dc5baa..2c3b9f5aa60 100644 --- a/python/cudf/cudf/tests/groupby/test_resample.py +++ b/python/cudf/cudf/tests/groupby/test_resample.py @@ -156,22 +156,22 @@ def test_dataframe_resample_level(): @pytest.mark.parametrize( - "in_freq, sampling_freq, out_freq", + "in_freq, sampling_freq", [ - ("1ns", "1us", "us"), - ("1us", "10us", "us"), - ("ms", "100us", "us"), - ("ms", "1s", "s"), - ("s", "1min", "s"), - ("1min", "30s", "s"), - ("1D", "10D", "s"), - ("10D", "1D", "s"), + ("1ns", "1us"), + ("1us", "10us"), + ("ms", "100us"), + ("ms", "1s"), + ("s", "1min"), + ("1min", "30s"), + ("1D", "10D"), + ("10D", "1D"), ], ) -def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): +def test_resampling_frequency_conversion(in_freq, sampling_freq): rng = np.random.default_rng(seed=0) - # test that we cast to the appropriate frequency - # when resampling: + # Pandas resample preserves the input column's unit; verify cuDF + # matches that behavior across sampling frequencies. 
pdf = pd.DataFrame( { "x": rng.standard_normal(size=100), @@ -183,7 +183,7 @@ def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq): got = gdf.resample(sampling_freq, on="y").mean() assert_resample_results_equal(expect, got) - assert got.index.dtype == np.dtype(f"datetime64[{out_freq}]") + assert got.index.dtype == pdf["y"].dtype def test_resampling_downsampling_ms(): @@ -198,3 +198,52 @@ def test_resampling_downsampling_ms(): result = gdf.resample("10ms", on="time").mean() result.index = result.index.astype("datetime64[ns]") assert_eq(result, expected, check_freq=False) + + +@pytest.mark.parametrize("input_unit", ["s", "ms", "us", "ns"]) +@pytest.mark.parametrize("freq", ["D", "h", "30min"]) +@pytest.mark.parametrize( + "agg", ["mean", "sum", "min", "max", "first", "last", "count", "var"] +) +def test_resample_empty_preserves_input_unit_and_freq(input_unit, freq, agg): + # Resample on an empty datetime index must preserve the input column's + # unit (pandas behavior; cuDF previously collapsed to [s] for D/h offsets) + # and must keep the offset attached to the result index. + idx = pd.DatetimeIndex([], dtype=f"datetime64[{input_unit}]", name="t") + pser = pd.Series([], index=idx, dtype=float) + gser = cudf.from_pandas(pser) + + expected = getattr(pser.resample(freq), agg)() + actual = getattr(gser.resample(freq), agg)() + + assert actual.index.dtype == expected.index.dtype + assert_eq(actual, expected, check_dtype=False, check_index_type=False) + + +def test_resample_size_matches_pandas_with_empty_buckets(): + # GroupBy.size bypasses _Resampler.agg, so the freq has to be re-attached + # by the size override; empty buckets must come back as 0 (not NaN), and + # the result must be sorted by bin label like pandas. 
+ idx = pd.date_range("2020-01-01", periods=4, freq="1h") + pser = pd.Series(range(4), index=idx) + gser = cudf.from_pandas(pser) + + expected = pser.resample("30min").size() + actual = gser.resample("30min").size() + + assert_eq(actual, expected, check_dtype=False, check_index_type=False) + + # Empty case + idx_empty = pd.DatetimeIndex([], dtype="datetime64[us]", name="t") + pser_empty = pd.Series([], index=idx_empty, dtype=float) + gser_empty = cudf.from_pandas(pser_empty) + + expected_empty = pser_empty.resample("h").size() + actual_empty = gser_empty.resample("h").size() + + assert_eq( + actual_empty, + expected_empty, + check_dtype=False, + check_index_type=False, + ) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_to_pandas.py b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_to_pandas.py index b2567237ec0..f5331769793 100644 --- a/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_to_pandas.py +++ b/python/cudf/cudf/tests/indexes/datetimeindex/methods/test_to_pandas.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import pandas as pd +import pytest import cudf from cudf.testing import assert_eq @@ -57,3 +58,29 @@ def test_to_pandas_externally_set_stale_freq_matches_pandas_inferred(): assert actual.freq == pd.tseries.frequencies.to_offset(expected_freq) assert_eq(actual.values, pidx.values) + + +@pytest.mark.parametrize("freq", ["D", "h", "30min", "2D", "ME"]) +def test_to_pandas_empty_with_freq_falls_back_to_cached(freq): + # Empty indexes have nothing to infer from, so to_pandas() must fall back + # to the cached freq mapped through DateOffset._maybe_as_fast_pandas_offset + # (matches pandas, which keeps the offset on empty resample/asfreq output). 
+ pidx = pd.DatetimeIndex([], dtype="datetime64[us]", freq=freq, name="t") + gidx = cudf.from_pandas(pidx) + + actual = gidx.to_pandas() + assert actual.freq == pidx.freq + assert actual.dtype == pidx.dtype + assert_eq(actual, pidx) + + +@pytest.mark.parametrize("freq", ["D", "h", "30min"]) +def test_to_pandas_single_element_with_freq_falls_back_to_cached(freq): + # Single-element indexes can't infer freq (pandas inferred_freq is None), + # but the cached freq is still authoritative — preserve it on round-trip. + pidx = pd.DatetimeIndex(["2020-01-01"], freq=freq, name="t") + gidx = cudf.from_pandas(pidx) + + actual = gidx.to_pandas() + assert actual.freq == pidx.freq + assert_eq(actual, pidx) diff --git a/python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py b/python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py index 795455aeebe..1e91ffda8b8 100644 --- a/python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py +++ b/python/cudf/cudf/tests/indexes/datetimeindex/test_constructor.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 @@ -98,3 +98,44 @@ def test_from_pandas_datetimeindex_freq(): actual = cudf.Index(expected) assert_eq(expected, actual) assert actual.freq is not None + + +@pytest.mark.parametrize( + "data", + [ + [], + (), + [None], + [None, None], + pd.Series([], dtype="object"), + ], +) +def test_datetime_index_empty_object_default_dtype(data): + # Pandas's array_to_datetime falls back to "s" precision when no + # concrete (non-NaT) datetime is observed (pandas-dev/pandas#55901). + # cuDF should match so cudf.pandas-wrapped pd.DatetimeIndex([]) doesn't + # diverge from plain pandas. Inputs to this test deliberately carry no + # explicit dtype so cuDF's default-unit logic is what's exercised. 
+ pd_data = data + gd_data = cudf.from_pandas(data) if isinstance(data, pd.Series) else data + expected = pd.DatetimeIndex(pd_data) + actual = cudf.DatetimeIndex(gd_data) + assert actual.dtype == expected.dtype + assert_eq(actual, expected) + + +@pytest.mark.parametrize( + "values", + [ + ["2020-01-01"], + ["2020-01-01", "2020-01-02"], + ["1970-01-01 00:00:00.000000"], + ], +) +def test_datetime_index_string_input_us_default(values): + # Non-empty string parsing should still resolve to [us] (pandas 3 default + # when no nanosecond-precision component is present). + expected = pd.DatetimeIndex(values) + actual = cudf.DatetimeIndex(values) + assert actual.dtype == expected.dtype + assert_eq(actual, expected) From 31732df7d02d13ea53555d5f2a275c85abcf2331 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 5 May 2026 16:57:45 -0500 Subject: [PATCH 11/36] Expose additional proxy attributes for IntervalArray, Styler, and _MethodProxy (#22374) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary Three pandas-tests xfail entries surfaced `AttributeError` failures that were just missing entries in the proxy `additional_attributes` (or on `_MethodProxy` itself). ## Changes - **`IntervalArray` proxy** now exposes `_left` and `_right` (private), matching the existing `_data`/`_mask` plumbing. Fixes `test_series_from_temporary_intervalindex_readonly_data`. - **`Styler` proxy** now exposes `_compute`, `_display_funcs_column_names`, and `_display_funcs_index_names` (all private). Fixes `test_format_index_names_clear[_display_funcs_column_names-kwargs1]` and `[_display_funcs_index_names-kwargs0]`. - **`_MethodProxy`** now exposes `__func__` (forwarded to the slow underlying method), mirroring the existing `__name__` and `__doc__` properties. This is required for callers that introspect classmethod descriptors via `type(x).method.__func__`. ## Conftest Removed three `NODEIDS_THAT_FAIL` entries whose underlying tests now pass. 
## Notes on remaining `AttributeError` xfails Audited the remaining 17 `AttributeError` xfail entries; they fall into a few buckets that need deeper changes (out of scope for this PR): - **Slow-side `isinstance` failures** (`Styler._compute`, `'DataFrame'/'SubclassedDataFrame' object has no attribute 'dtype'`): the slow-side function's `__globals__` was bound at import time before the proxy classes were installed, so `isinstance(proxy_df, real_DataFrame)` is `False` inside the slow module. Needs a different mechanism than `additional_attributes`. - **Mixed-type Series limitations** (`quantile_box`, `quantile_box_nat`, `quantile_date_range`, `quantile_ea_scalar`): cuDF documents that it returns a `DataFrame` instead of a `Series` when the result would be mixed-type — the proxy preserves that type, breaking downstream `assert_series_equal`. - **`.values` returning ndarray for nullable dtypes** (`test_construct_from_dict_ea_series`): pure pandas returns `IntegerArray`; cuDF returns `ndarray`. - **Other one-offs** (`SparseArray.reshape`, abstract `_from_sequence_of_strings`, custom accessor `xyz`, loc setitem datetime parsing, `_fsproxy_slow` proxy-conversion failure): each needs its own targeted fix. 
Co-authored-by: Claude Opus 4.7 (1M context) --- python/cudf/cudf/pandas/_wrappers/pandas.py | 9 +++++++++ python/cudf/cudf/pandas/fast_slow_proxy.py | 4 ++++ python/cudf/cudf/pandas/scripts/conftest-patch.py | 3 --- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 0bdc05205f5..f0b0158d87b 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -1035,6 +1035,8 @@ def Index__setattr__(self, name, value): "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"), "_data": _FastSlowAttribute("_data", private=True), "_mask": _FastSlowAttribute("_mask", private=True), + "_left": _FastSlowAttribute("_left", private=True), + "_right": _FastSlowAttribute("_right", private=True), }, ) @@ -1363,6 +1365,7 @@ def Index__setattr__(self, name, value): "_display_funcs": _FastSlowAttribute( "_display_funcs", private=True ), + "_compute": _FastSlowAttribute("_compute", private=True), "table_styles": _FastSlowAttribute("table_styles"), "columns": _FastSlowAttribute("columns"), "caption": _FastSlowAttribute("caption"), @@ -1372,6 +1375,12 @@ def Index__setattr__(self, name, value): "_display_funcs_index": _FastSlowAttribute( "_display_funcs_index", private=True ), + "_display_funcs_column_names": _FastSlowAttribute( + "_display_funcs_column_names", private=True + ), + "_display_funcs_index_names": _FastSlowAttribute( + "_display_funcs_index_names", private=True + ), "uuid": _FastSlowAttribute("uuid"), "hide_index_": _FastSlowAttribute("hide_index_"), "hide_index_names": _FastSlowAttribute("hide_index_names"), diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py index 91119a5519d..985cce7c644 100644 --- a/python/cudf/cudf/pandas/fast_slow_proxy.py +++ b/python/cudf/cudf/pandas/fast_slow_proxy.py @@ -1125,6 +1125,10 @@ def __dir__(self): def __doc__(self): return 
self._fsproxy_slow.__doc__ + @property + def __func__(self): + return self._fsproxy_slow.__func__ + @property def __name__(self): return self._fsproxy_slow.__name__ diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 373e0bf1670..7b1e709496d 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -766,7 +766,6 @@ def pytest_unconfigure(config): "tests/copy_view/index/test_index.py::test_index_to_frame": "TODO: Add a reason for failure", "tests/copy_view/index/test_index.py::test_index_values": "TODO: Add a reason for failure", "tests/copy_view/index/test_index.py::test_set_index_series": "TODO: Add a reason for failure", - "tests/copy_view/index/test_intervalindex.py::test_series_from_temporary_intervalindex_readonly_data": "AttributeError: 'IntervalArray' object has no attribute '_left'. Did you mean: 'left'?", "tests/copy_view/test_array.py::test_dataframe_array_ea_dtypes[np.array]": "AssertionError: assert False", "tests/copy_view/test_array.py::test_dataframe_array_ea_dtypes[np.asarray-dtype]": "AssertionError: assert False", "tests/copy_view/test_array.py::test_dataframe_array_ea_dtypes[np.asarray]": "AssertionError: assert False", @@ -5152,8 +5151,6 @@ def pytest_unconfigure(config): "tests/io/excel/test_writers.py::TestExcelWriterEngineTests::test_register_writer": "TODO: Add a reason for failure", "tests/io/excel/test_writers.py::TestFSPath::test_excelfile_fspath": "TODO: Add a reason for failure", "tests/io/formats/style/test_format.py::test_format_clear[format_index-_display_funcs_columns-kwargs2]": "TODO: Add a reason for failure", - "tests/io/formats/style/test_format.py::test_format_index_names_clear[_display_funcs_column_names-kwargs1]": "AttributeError: 'Styler' object has no attribute '_display_funcs_column_names'", - 
"tests/io/formats/style/test_format.py::test_format_index_names_clear[_display_funcs_index_names-kwargs0]": "AttributeError: 'Styler' object has no attribute '_display_funcs_index_names'", "tests/io/formats/style/test_html.py::test_from_custom_template_style": "TODO: Add a reason for failure", "tests/io/formats/style/test_html.py::test_from_custom_template_table": "TODO: Add a reason for failure", "tests/io/formats/style/test_html.py::test_replaced_css_class_names": "TODO: Add a reason for failure", From 4aa57e5091b766221ac6f54f4ecfede71c40b254 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Tue, 5 May 2026 23:25:55 +0200 Subject: [PATCH 12/36] Multi-rank fixes for cudf-polars streaming (#22361) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix bugs that appear when running with `num_ranks > 1`, where client-side `pl.concat(per_rank_outputs)` exposes assumptions that do not hold under single-rank execution. These were all discovered while working on multi-rank tests. **NB:** Please take a close look during review, as I’m still a bit unfamiliar with the IR part of cudf-polars. Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) Approvers: - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/22361 --- .../experimental/rapidsmpf/join.py | 12 ++++- .../experimental/rapidsmpf/union.py | 46 ++++++++++++------- .../cudf_polars/experimental/select.py | 8 +++- .../tests/experimental/test_dataframescan.py | 9 ++++ python/cudf_polars/tests/test_groupby.py | 9 +++- 5 files changed, 62 insertions(+), 22 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/join.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/join.py index abb2e7082f0..b36b07342ce 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/join.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/join.py @@ -205,9 +205,19 @@ async def _collect_small_side_for_broadcast( for s_id in range(len(chunks)): inserter.insert(s_id, chunks.pop(0)) stream = ir_context.get_cuda_stream() + gathered = await allgather.extract_concatenated(stream) + # When every rank inserted zero chunks, the AllGather has no schema + # to infer and returns a 0-column table. Substitute a properly typed + # empty table for the small side so downstream joins still match the + # expected schema. 
+ table = ( + empty_table_chunk(ir, context, stream).table_view() + if gathered.num_columns() == 0 and len(ir.schema) > 0 + else gathered + ) dfs = [ DataFrame.from_table( - await allgather.extract_concatenated(stream), + table, list(ir.schema.keys()), list(ir.schema.values()), stream, diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/union.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/union.py index b4cb6a922b9..2484620234d 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/union.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/union.py @@ -24,6 +24,7 @@ ) if TYPE_CHECKING: + from rapidsmpf.communicator.communicator import Communicator from rapidsmpf.streaming.core.channel import Channel from rapidsmpf.streaming.core.context import Context @@ -34,6 +35,7 @@ @define_actor() async def union_node( context: Context, + comm: Communicator, ir: Union, ir_context: IRExecutionContext, ch_out: Channel[TableChunk], @@ -46,6 +48,9 @@ async def union_node( ---------- context The rapidsmpf context. + comm + The communicator. Used to suppress duplicated children's chunks on + non-root ranks so they aren't emitted twice cluster-wide. ir The Union IR node. ir_context @@ -61,14 +66,19 @@ async def union_node( # Merge and forward metadata. # Union loses partitioning/ordering info since sources may differ. # TODO: Warn users that Union does NOT preserve order? - total_local_count = 0 - duplicated = True metadata = await gather_in_task_group( *(recv_metadata(ch, context) for ch in chs_in) ) - for meta in metadata: - total_local_count += meta.local_count - duplicated = duplicated and meta.duplicated + # When a child has duplicated=True, every rank has produced the same + # data and only rank 0 should forward it -- otherwise the downstream + # client-side concat would over-count by `nranks - 1` for each + # duplicated chunk. 
+ skip = tuple(meta.duplicated and comm.rank != 0 for meta in metadata) + total_local_count = sum( + 0 if drop else meta.local_count + for meta, drop in zip(metadata, skip, strict=True) + ) + duplicated = all(meta.duplicated for meta in metadata) await send_metadata( ch_out, context, @@ -79,21 +89,22 @@ async def union_node( ) seq_num_offset = 0 - for ch_in in chs_in: + for ch_in, drop in zip(chs_in, skip, strict=True): num_ch_chunks = 0 while (msg := await ch_in.recv(context)) is not None: - num_ch_chunks += 1 - await ch_out.send( - context, - Message( - msg.sequence_number + seq_num_offset, - TableChunk.from_message( - msg, br=context.br() - ).make_available_and_spill( - context.br(), allow_overbooking=True + if not drop: + await ch_out.send( + context, + Message( + msg.sequence_number + seq_num_offset, + TableChunk.from_message( + msg, br=context.br() + ).make_available_and_spill( + context.br(), allow_overbooking=True + ), ), - ), - ) + ) + num_ch_chunks += 1 seq_num_offset += num_ch_chunks await ch_out.drain(context) @@ -116,6 +127,7 @@ def _( nodes[ir] = [ union_node( rec.state["context"], + rec.state["comm"], ir, rec.state["ir_context"], channels[ir].reserve_input_slot(), diff --git a/python/cudf_polars/cudf_polars/experimental/select.py b/python/cudf_polars/cudf_polars/experimental/select.py index 25d0189fdf6..9ab30f9be13 100644 --- a/python/cudf_polars/cudf_polars/experimental/select.py +++ b/python/cudf_polars/cudf_polars/experimental/select.py @@ -431,13 +431,17 @@ def _( ) named_expr = expr.NamedExpr(ir.exprs[0].name or "len", lit_expr) + # Use Empty as the input so the streaming network's metadata flows + # `duplicated=True` end-to-end. Without that, every rank emits the + # literal once and the client concatenates N copies. 
+ input_ir: IR = Empty({}) new_node = Select( {named_expr.name: named_expr.value.dtype}, [named_expr], should_broadcast=True, - df=child, + df=input_ir, ) - partition_info[new_node] = PartitionInfo(count=1) + partition_info[input_ir] = partition_info[new_node] = PartitionInfo(count=1) return new_node, partition_info if not any( diff --git a/python/cudf_polars/tests/experimental/test_dataframescan.py b/python/cudf_polars/tests/experimental/test_dataframescan.py index 57684734fea..dbf22848824 100644 --- a/python/cudf_polars/tests/experimental/test_dataframescan.py +++ b/python/cudf_polars/tests/experimental/test_dataframescan.py @@ -60,6 +60,15 @@ def test_parallel_dataframescan(df, streaming_engine_factory, max_rows_per_parti assert count == 1 +@pytest.mark.xfail( + reason=( + "Multi-rank Union interleaves child outputs across ranks: client " + "receives [rank0_A, rank0_B, rank1_A, rank1_B] instead of the " + "polars-CPU [A, B]. Tracked in " + "https://github.com/rapidsai/cudf/issues/22376." + ), + strict=False, +) def test_dataframescan_concat(df, streaming_engine_factory): streaming_engine = streaming_engine_factory( StreamingOptions(max_rows_per_partition=1_000), diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index a14177b9f0c..f14160a1043 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -501,8 +501,13 @@ def test_groupby_sum_decimal_null_group(engine: pl.GPUEngine) -> None: @pytest.mark.xfail( - raises=AssertionError, - reason="https://github.com/rapidsai/cudf/issues/19610", + raises=(AssertionError, pl.exceptions.SchemaError), + reason=( + "https://github.com/rapidsai/cudf/issues/19610 — in-memory engine " + "fails with AssertionError (wrong values); multi-rank streaming " + "fails earlier with SchemaError (literal agg yields a divergent " + "schema after cross-rank concat)." 
+ ), ) def test_groupby_literal_agg(engine: pl.GPUEngine): df = pl.LazyFrame({"c0": [True, False]}) From aa23377bbb79b99accbd986ffd72d5a864dbc225 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 5 May 2026 15:40:28 -0700 Subject: [PATCH 13/36] Fix reading of large CSV files (>64MB) (#22375) Fixes a regression in #22237 where reading a CSV larger than the internal 64 MiB chunk size dropped all rows past the first chunk. Root cause is a misuse of a clamped value to determine the EOF state. This PR fixes the EOF transition so it only happens in the last chunk. Also added a large test - all previous CSV tests were below the chunk threshold. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Basit Ayantunde (https://github.com/lamarrr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/22375 --- cpp/src/io/csv/csv_gpu.cu | 3 ++- cpp/tests/io/csv_test.cpp | 46 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 3a407e03d99..90a112a9a43 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -736,7 +736,8 @@ CUDF_KERNEL void __launch_bounds__(rowofs_block_dim) ctx = make_char_context(ROW_CTX_NONE, ROW_CTX_QUOTE, ROW_CTX_NONE); } } else { - if (cur <= end && cur == data_end) { + bool const is_last_chunk = data_end_off <= data.size(); + if (is_last_chunk && cur <= end && cur == data_end) { // Add a newline at data end (need the extra row offset to infer length of previous row) ctx = make_char_context(ROW_CTX_EOF, ROW_CTX_EOF, ROW_CTX_EOF, 1, 1, 1); } else { diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 7be01e80f4f..2cb96215ab6 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -12,9 +12,12 @@ #include #include +#include #include #include #include +#include +#include #include #include 
#include @@ -1307,6 +1310,49 @@ TEST_F(CsvReaderTest, TypeInferenceEmptyDelimitedFields) expect_column_data_equal(std::vector{3, 6}, result_view.column(2)); } +TEST_F(CsvReaderTest, MultiChunkRowCount) +{ + // TODO: add reader option to set chunk size and use it here + constexpr size_t chunk_threshold = 64ull * 1024 * 1024; + std::string const row = "123,456,789\n"; + size_t const num_rows = (chunk_threshold / row.size()) + 1024; + + std::string buffer; + buffer.reserve(num_rows * row.size()); + for (size_t i = 0; i < num_rows; ++i) { + buffer.append(row); + } + + cudf::io::csv_reader_options const in_opts = + cudf::io::csv_reader_options::builder( + cudf::io::source_info{cudf::host_span{ + reinterpret_cast(buffer.data()), buffer.size()}}) + .header(-1); + auto const result = cudf::io::read_csv(in_opts); + auto const result_view = result.tbl->view(); + + ASSERT_EQ(result_view.num_columns(), 3); + EXPECT_EQ(static_cast(result_view.num_rows()), num_rows); + EXPECT_EQ(result_view.column(0).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(1).type().id(), type_id::INT64); + EXPECT_EQ(result_view.column(2).type().id(), type_id::INT64); + + // All rows are identical, so verifying min == max == expected + auto const i64 = cudf::data_type{cudf::type_id::INT64}; + auto const min_agg = cudf::make_min_aggregation(); + auto const max_agg = cudf::make_max_aggregation(); + auto const all_equal = [&](cudf::column_view const& col, int64_t expected) { + using scalar_t = cudf::numeric_scalar; + auto const min = cudf::reduce(col, *min_agg, i64); + auto const max = cudf::reduce(col, *max_agg, i64); + return static_cast(*min).value() == expected && + static_cast(*max).value() == expected; + }; + EXPECT_TRUE(all_equal(result_view.column(0), 123)); + EXPECT_TRUE(all_equal(result_view.column(1), 456)); + EXPECT_TRUE(all_equal(result_view.column(2), 789)); +} + TEST_F(CsvReaderTest, TypeInferenceThousands) { std::string buffer = "1`400,123,1`234.56\n123`456,123456,12.34"; From 
4aa13f159c0998c1ea80deea4bb98510a4a509da Mon Sep 17 00:00:00 2001 From: Yunsong Wang <12716979+PointKernel@users.noreply.github.com> Date: Tue, 5 May 2026 15:44:29 -0700 Subject: [PATCH 14/36] Add decimal128 to groupby_max_cardinality benchmark (#22162) Closes #22154 This PR adds decimal128 values to the groupby_max_cardinality benchmark. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/22162 --- cpp/benchmarks/groupby/group_max.cpp | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 29d5645d220..d837cfac665 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -1,15 +1,18 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2022-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2022-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #include #include +#include #include #include +NVBENCH_DECLARE_TYPE_STRINGS(numeric::decimal128, "decimal128", "decimal128"); + template void groupby_max_helper(nvbench::state& state, cudf::size_type num_rows, @@ -26,8 +29,13 @@ void groupby_max_helper(nvbench::state& state, }(); auto const make_values = [&]() { - auto builder = data_profile_builder().cardinality(0).distribution( - cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + auto builder = data_profile_builder().cardinality(0); + if constexpr (cudf::is_fixed_point()) { + builder.distribution( + cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows, numeric::scale_type{0}); + } else { + builder.distribution(cudf::type_to_id(), distribution_id::UNIFORM, 0, num_rows); + } if (null_probability > 0) { builder.null_probability(null_probability); } else { @@ -91,7 +99,8 @@ NVBENCH_BENCH_TYPES(bench_groupby_max, .add_float64_axis("null_probability", {0, 0.1, 0.9}) .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32}); -NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list)) +NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, + NVBENCH_TYPE_AXES(nvbench::type_list)) .set_name("groupby_max_cardinality") .add_int64_axis("num_aggregations", {1, 2, 3, 4, 5, 6, 7, 8}) .add_int64_axis("cardinality", {20, 50, 100, 1'000, 10'000, 100'000, 1'000'000}); From c5cb03bedb3e8c8f6dfe8ac7cc53fa5441cedb9d Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Wed, 6 May 2026 00:52:34 +0200 Subject: [PATCH 15/36] `StreamingEngine._reset()` (#22364) Generalizes `RayEngine._reset()` to `SPMDEngine` and `DaskEngine`. All three engines now swap the rapidsmpf `Context` and the Polars `GPUEngine` configuration in place, while keeping the communicator and RMM resource alive. The test suite is refactored to take advantage of this. A session-scoped `streaming_engines` fixture bootstraps the SPMD communicator and constructs a shared `SPMDEngine`. 
Per-test `spmd_engine` and `streaming_engine_factory` fixtures call `_reset` on the cached engine instead of constructing a new one. This pattern will be extended to `RayEngine` and `DaskEngine` tests in the future as they are incorporated into the test matrix. Authors: - Mads R. B. Kristensen (https://github.com/madsbk) Approvers: - Peter Andreas Entschev (https://github.com/pentschev) URL: https://github.com/rapidsai/cudf/pull/22364 --- .../experimental/rapidsmpf/frontend/core.py | 94 ++++++ .../experimental/rapidsmpf/frontend/dask.py | 104 ++++++- .../experimental/rapidsmpf/frontend/ray.py | 95 +----- .../experimental/rapidsmpf/frontend/spmd.py | 104 ++++++- .../cudf_polars/testing/engine_utils.py | 46 +-- python/cudf_polars/tests/conftest.py | 104 ++++--- .../experimental/test_all_gather_host_data.py | 16 +- .../tests/experimental/test_allgather.py | 11 +- .../tests/experimental/test_dask.py | 94 ++++++ .../tests/experimental/test_io_multirank.py | 6 +- .../tests/experimental/test_ray.py | 6 +- .../tests/experimental/test_sink.py | 4 +- .../tests/experimental/test_spilling.py | 198 ++++++------ .../tests/experimental/test_spmd.py | 293 +++++++++++------- .../tests/experimental/test_statistics.py | 6 +- 15 files changed, 781 insertions(+), 400 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py index 69e6a36dca2..7bc8dabddec 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py @@ -13,6 +13,7 @@ import cuda.core from rapidsmpf.coll import AllGather +from rapidsmpf.config import Options, get_environment_variables from rapidsmpf.memory.packed_data import PackedData from rapidsmpf.statistics import Statistics from rapidsmpf.streaming.core.actor import run_actor_network @@ -50,6 +51,39 @@ T = TypeVar("T") +def 
resolve_rapidsmpf_options(rapidsmpf_options: Options | None) -> Options: + """ + Resolve ``rapidsmpf_options`` and apply cross-frontend defaults. + + If ``None`` is passed, constructs an ``Options`` instance from + environment variables. Then applies defaults that should be consistent + across SPMD, Ray, and Dask. Defaults are set via + ``Options.insert_if_absent``, so explicit values or environment + variables always take precedence. + + Defaults applied: + + - ``num_streaming_threads=4``: moderate worker count for the rapidsmpf + streaming runtime, shared across frontends. + + Parameters + ---------- + rapidsmpf_options + Existing options to resolve, or ``None`` to construct from environment + variables. + + Returns + ------- + Options + Resolved options with cross-frontend defaults applied. + """ + if rapidsmpf_options is None: + rapidsmpf_options = Options(get_environment_variables()) + + rapidsmpf_options.insert_if_absent({"num_streaming_threads": "4"}) + return rapidsmpf_options + + @dataclasses.dataclass(frozen=True) class ClusterInfo: """ @@ -201,6 +235,66 @@ def global_statistics(self, *, clear: bool = False) -> Statistics: """ return Statistics.merge(self.gather_statistics(clear=clear)) + def _reset( + self, + *, + rapidsmpf_options: Options | None = None, + executor_options: dict[str, Any] | None = None, + engine_options: dict[str, Any] | None = None, + ) -> None: + """ + Reset the engine with new options, keeping cluster resources alive. + + The following inputs are fixed at construction time and cannot change: + - ``num_ranks`` + - ``num_py_executors`` (in ``executor_options``) + - ``hardware_binding`` (in ``engine_options``) + - ``memory_resource_config`` (in ``engine_options``) + + Subclasses must override this method. The override should: + 1. Raise :class:`RuntimeError` if the engine is already shut down. + 2. Call ``super()._reset(...)`` to apply the universal option validation below. + 3. Perform the backend-specific rebuild. 
+ + Parameters + ---------- + rapidsmpf_options + New :class:`Options` for each rank's :class:`Context`. + ``None`` is treated as an empty dict. + executor_options + New executor options for the polars ``GPUEngine`` layer. + ``None`` is treated as an empty dict. + engine_options + New engine options for the polars ``GPUEngine`` layer. + ``None`` is treated as an empty dict. + + Raises + ------ + ValueError + If ``executor_options`` or ``engine_options`` contains a + construction-time-only key (see list above), or if a + reserved key is set (via :func:`check_reserved_keys`). + """ + executor_options = executor_options or {} + engine_options = engine_options or {} + check_reserved_keys(executor_options, engine_options) + + _disallowed_exec = {"num_py_executors"} & executor_options.keys() + if _disallowed_exec: + raise ValueError( + f"executor_options keys {sorted(_disallowed_exec)} cannot be " + "changed via _reset(). Construct a fresh engine instead." + ) + _disallowed_engine = { + "hardware_binding", + "memory_resource_config", + } & engine_options.keys() + if _disallowed_engine: + raise ValueError( + f"engine_options keys {sorted(_disallowed_engine)} cannot be " + "changed via _reset(). Construct a fresh engine instead." + ) + def shutdown(self) -> None: """ Shut down engine and release all owned resources. 
diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py index eb32abcf375..49810e998fd 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py @@ -19,10 +19,7 @@ import ucxx._lib.libucxx as ucx_api from rapidsmpf import bootstrap from rapidsmpf.communicator.ucxx import barrier, get_root_ucxx_address, new_communicator -from rapidsmpf.config import ( - Options, - get_environment_variables, -) +from rapidsmpf.config import Options from rapidsmpf.progress_thread import ProgressThread from rapidsmpf.rmm_resource_adaptor import RmmResourceAdaptor from rapidsmpf.streaming.core.context import Context @@ -36,6 +33,7 @@ StreamingEngine, check_reserved_keys, evaluate_on_rank, + resolve_rapidsmpf_options, ) from cudf_polars.experimental.rapidsmpf.frontend.hardware_binding import ( HardwareBindingPolicy, @@ -294,6 +292,47 @@ def _teardown_worker( delattr(dask_worker, attr) +def _reset_worker( + rapidsmpf_options_as_bytes: bytes, + *, + uid: str, + dask_worker: distributed.Worker | None = None, +) -> None: + """ + Rebuild the streaming Context with new options. + + Must be called collectively on all workers. A barrier ensures no + worker tears down its Context while peers may still be using it. + + Parameters + ---------- + rapidsmpf_options_as_bytes + Serialized :class:`Options` to install. + uid + Cluster instance identifier used to look up the per-worker context. + dask_worker + Injected by ``distributed`` when called via :meth:`distributed.Client.run`. 
+ """ + assert dask_worker is not None + attr = f"_cudf_polars_mp_context_{uid}" + mp_ctx: _WorkerContext | None = getattr(dask_worker, attr, None) + if mp_ctx is None: + raise RuntimeError(f"_reset_worker called before _setup_worker for uid={uid}") + assert mp_ctx.comm is not None + assert mp_ctx.ctx is not None + # Collective: all ranks idle before any rank tears down its Context. + if mp_ctx.comm.nranks > 1: + barrier(mp_ctx.comm) + # Explicit shutdown is thread-affine. ``distributed.worker.run`` + # dispatches sync work onto the worker's event-loop thread, which is + # the same thread that built the Context in ``_setup_worker``. + mp_ctx.ctx.shutdown() + mp_ctx.ctx = None + options = Options.deserialize(rapidsmpf_options_as_bytes) + mp_ctx.ctx = Context.from_options(mp_ctx.comm.logger, mp_ctx.mr, options) + rmm.mr.set_current_device_resource(mp_ctx.ctx.br().device_mr) + + def _get_statistics( *, clear: bool, uid: str, dask_worker: distributed.Worker | None = None ) -> tuple[int, Statistics]: @@ -563,13 +602,9 @@ def __init__( "memory_resource_config", None ) - rapidsmpf_options = ( + rapidsmpf_options_as_bytes = resolve_rapidsmpf_options( rapidsmpf_options - if rapidsmpf_options is not None - else Options(get_environment_variables()) - ) - rapidsmpf_options.insert_if_absent({"num_streaming_threads": "4"}) - rapidsmpf_options_as_bytes = rapidsmpf_options.serialize() + ).serialize() # Unique identifier for this cluster instance; namespaces the per-worker # attribute so multiple DaskEngine contexts can coexist on the same workers. 
@@ -660,6 +695,55 @@ def __init__( engine_options={**engine_options, "memory_resource": None}, ) + def _reset( + self, + *, + rapidsmpf_options: Options | None = None, + executor_options: dict[str, Any] | None = None, + engine_options: dict[str, Any] | None = None, + ) -> None: + """Reset the engine; see :meth:`StreamingEngine._reset` for the contract.""" + if self._dask_context is None: + raise RuntimeError("Cannot reset a shut-down engine") + super()._reset( + rapidsmpf_options=rapidsmpf_options, + executor_options=executor_options, + engine_options=engine_options, + ) + executor_options = executor_options or {} + engine_options = engine_options or {} + + rapidsmpf_options_as_bytes = resolve_rapidsmpf_options( + rapidsmpf_options + ).serialize() + + ctx = self._dask_context + # Reset all worker Contexts collectively. ``client.run`` blocks + # until every worker's reset returns; the per-worker barrier + # inside :func:`_reset_worker` synchronizes the teardown across + # workers. + ctx.client.run( + functools.partial(_reset_worker, uid=ctx.rapidsmpf_id), + rapidsmpf_options_as_bytes, + ) + + # Re-run ``StreamingEngine.__init__`` on the existing instance to + # reconfigure the polars ``GPUEngine`` layer (``self.config``, + # ``self.device``, etc.) with the new options. Pass the existing + # ``self._exit_stack`` so any registered callbacks survive. 
+ StreamingEngine.__init__( + self, + nranks=self._nranks, + executor_options={ + **executor_options, + "runtime": "rapidsmpf", + "cluster": "dask", + "dask_context": ctx, + }, + engine_options={**engine_options, "memory_resource": None}, + exit_stack=self._exit_stack, + ) + @classmethod def from_options( cls, diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py index 47c88249123..1ba92de3e49 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py @@ -14,10 +14,7 @@ import ucxx._lib.libucxx as ucx_api from rapidsmpf import bootstrap from rapidsmpf.communicator.ucxx import barrier, get_root_ucxx_address, new_communicator -from rapidsmpf.config import ( - Options, - get_environment_variables, -) +from rapidsmpf.config import Options from rapidsmpf.progress_thread import ProgressThread from rapidsmpf.rmm_resource_adaptor import RmmResourceAdaptor from rapidsmpf.streaming.core.context import Context @@ -31,6 +28,7 @@ StreamingEngine, check_reserved_keys, evaluate_on_rank, + resolve_rapidsmpf_options, ) from cudf_polars.experimental.rapidsmpf.frontend.hardware_binding import ( HardwareBindingPolicy, @@ -256,18 +254,6 @@ def reset(self, *, rapidsmpf_options_as_bytes: bytes) -> None: """ Rebuild the streaming Context with new options. - Keeps the UCXX communicator, the :class:`RmmResourceAdaptor`, - and the Python thread-pool executor alive — only the rapidsmpf - :class:`Context` is replaced. Used by :meth:`RayEngine._reset` - to amortize actor startup and UCX bootstrap costs across engines - that differ only in streaming options. - - The RMM resource is *not* rebuilt: UCX maps CUDA IPC buffers - against it (notably for pool memory resources) and never - releases those mappings during the application lifetime, so a - rebuilt MR would silently leak pool memory. 
Construct a fresh - :class:`RayEngine` if you need to swap the memory resource. - Must be called collectively on all actors. A barrier ensures no rank tears down its Context while peers may still be using it. @@ -280,7 +266,8 @@ def reset(self, *, rapidsmpf_options_as_bytes: bytes) -> None: raise RuntimeError("reset() requires setup_worker() to have run") assert self._comm is not None # Collective: all ranks idle before any rank tears down its Context. - barrier(self._comm) + if self._comm.nranks > 1: + barrier(self._comm) self._ctx.shutdown() self._ctx = None self._rapidsmpf_options = Options.deserialize(rapidsmpf_options_as_bytes) @@ -544,13 +531,9 @@ def __init__( "memory_resource_config", None ) - rapidsmpf_options = ( + rapidsmpf_options_as_bytes = resolve_rapidsmpf_options( rapidsmpf_options - if rapidsmpf_options is not None - else Options(get_environment_variables()) - ) - rapidsmpf_options.insert_if_absent({"num_streaming_threads": "4"}) - rapidsmpf_options_as_bytes = rapidsmpf_options.serialize() + ).serialize() exit_stack = contextlib.ExitStack() if not ray.is_initialized(): @@ -621,73 +604,23 @@ def _reset( executor_options: dict[str, Any] | None = None, engine_options: dict[str, Any] | None = None, ) -> None: - """ - Reset the engine with new options. - - Fast path for consecutive ``RayEngine`` uses that differ only in - streaming options. Avoids Ray actor startup and UCX bootstrap. - - Replaces engine state in full, similar to :meth:`__init__`. - ``StreamingEngine`` revalidates invariants on each reset, so callers - must pass required options (for example, ``allow_gpu_sharing=True`` - when ``num_ranks > 1``). 
- - The following inputs are fixed at construction time and cannot change: - - ``num_ranks`` - - ``num_py_executors`` (in ``executor_options``) - - ``hardware_binding`` (in ``engine_options``) - - ``memory_resource_config`` (in ``engine_options``) - - ``ray_init_options`` - - Parameters - ---------- - rapidsmpf_options - New :class:`Options` for each actor's ``Context``. Defaults to - ``Options(get_environment_variables())`` if ``None``. - executor_options - Polars ``GPUEngine`` executor options. ``None`` is treated as - an empty dict. - engine_options - Polars ``GPUEngine`` options. ``None`` is treated as an empty - dict. - """ + """Reset the engine; see :meth:`StreamingEngine._reset` for the contract.""" if self._rank_actors is None: raise RuntimeError("Cannot reset a shut-down engine") - + super()._reset( + rapidsmpf_options=rapidsmpf_options, + executor_options=executor_options, + engine_options=engine_options, + ) executor_options = executor_options or {} engine_options = engine_options or {} - check_reserved_keys(executor_options, engine_options) - - # Reject keys that cannot be changed. - _disallowed_exec = {"num_py_executors"} & executor_options.keys() - if _disallowed_exec: - raise ValueError( - f"executor_options keys {sorted(_disallowed_exec)} cannot be " - "changed via _reset(). Construct a fresh RayEngine instead." - ) - _disallowed_engine = { - "hardware_binding", - "memory_resource_config", - } & engine_options.keys() - if _disallowed_engine: - raise ValueError( - f"engine_options keys {sorted(_disallowed_engine)} cannot be " - "changed via _reset(). Construct a fresh RayEngine instead." 
- ) - - rapidsmpf_options = ( + rapidsmpf_options_as_bytes = resolve_rapidsmpf_options( rapidsmpf_options - if rapidsmpf_options is not None - else Options(get_environment_variables()) - ) - rapidsmpf_options.insert_if_absent({"num_streaming_threads": "4"}) - rapidsmpf_options_as_bytes = rapidsmpf_options.serialize() + ).serialize() # Reset all actor Contexts collectively. ``ray.get`` blocks until # every actor's reset returns; the per-actor barrier inside # :meth:`RankActor.reset` synchronizes the teardown across ranks. - # The per-actor RMM resource is kept alive across resets — see - # :meth:`RankActor.reset`. ray.get( [ rank.reset.remote( diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py index 0f52f83c1a1..65e3eb8b1e7 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py @@ -15,7 +15,7 @@ from rapidsmpf.communicator.single import ( new_communicator as single_communicator, ) -from rapidsmpf.config import Options, get_environment_variables +from rapidsmpf.communicator.ucxx import barrier from rapidsmpf.integrations.cudf.partition import unpack_and_concat from rapidsmpf.memory.packed_data import PackedData from rapidsmpf.progress_thread import ProgressThread @@ -36,6 +36,7 @@ all_gather_host_data, check_reserved_keys, evaluate_on_rank, + resolve_rapidsmpf_options, ) from cudf_polars.experimental.rapidsmpf.frontend.hardware_binding import ( HardwareBindingPolicy, @@ -49,6 +50,7 @@ from collections.abc import Callable from rapidsmpf.communicator.communicator import Communicator + from rapidsmpf.config import Options from rapidsmpf.streaming.cudf.channel_metadata import ChannelMetadata from cudf_polars.dsl.ir import IR @@ -341,11 +343,7 @@ def __init__( ) bind_to_gpu(hw_binding) - rapidsmpf_options = ( - rapidsmpf_options - if rapidsmpf_options is not None 
- else Options(get_environment_variables()) - ) + rapidsmpf_options = resolve_rapidsmpf_options(rapidsmpf_options) mr_config: MemoryResourceConfig | None = engine_options.get( "memory_resource_config", None ) @@ -369,17 +367,22 @@ def __init__( ) # else: caller-provided comm; the caller retains ownership - py_executor = ThreadPoolExecutor( + self._py_executor: ThreadPoolExecutor = ThreadPoolExecutor( max_workers=cast(int, executor_options.get("num_py_executors", 8)), thread_name_prefix="spmd-executor", ) + self._mr: RmmResourceAdaptor = mr exit_stack = contextlib.ExitStack() try: - exit_stack.callback(py_executor.shutdown, wait=False) + exit_stack.callback(self._py_executor.shutdown, wait=False) exit_stack.enter_context(set_memory_resource(mr)) - ctx = exit_stack.enter_context( - Context.from_options(comm.logger, mr, rapidsmpf_options) - ) + # ``Context`` is *not* registered as a context manager so that + # :meth:`_reset` can swap it mid-life without leaving the + # exit-stack holding a stale reference. ``_cleanup_ctx`` is + # registered instead — it shuts down whatever ``self._ctx`` is + # at engine-shutdown time (i.e. the latest reset's Context). + ctx = Context.from_options(comm.logger, mr, rapidsmpf_options) + exit_stack.callback(self._cleanup_ctx) self._comm: Communicator | None = comm self._ctx: Context | None = ctx super().__init__( @@ -389,7 +392,7 @@ def __init__( "runtime": "rapidsmpf", "cluster": "spmd", "spmd_context": SPMDContext( - comm=comm, context=ctx, py_executor=py_executor + comm=comm, context=ctx, py_executor=self._py_executor ), }, engine_options={ @@ -402,6 +405,17 @@ def __init__( exit_stack.close() raise + def _cleanup_ctx(self) -> None: + """ + Shut down the current ``self._ctx`` if any; called from exit-stack. + + ``Context.shutdown()`` is idempotent on the rapidsmpf C++ side, so this is + safe even if a prior ``_reset`` already shut down a now-replaced Context. 
+ """ + if self._ctx is not None: + self._ctx.shutdown() + self._ctx = None + @classmethod def from_options(cls, options: StreamingOptions) -> SPMDEngine: """ @@ -436,6 +450,65 @@ def from_options(cls, options: StreamingOptions) -> SPMDEngine: engine_options=options.to_engine_options(), ) + def _reset( + self, + *, + rapidsmpf_options: Options | None = None, + executor_options: dict[str, Any] | None = None, + engine_options: dict[str, Any] | None = None, + ) -> None: + """ + Reset the engine; see :meth:`StreamingEngine._reset` for the contract. + + Must be called collectively on all ranks. A barrier ensures no + rank tears down its Context while peers may still be using it. + """ + if self._ctx is None: + raise RuntimeError("Cannot reset a shut-down engine") + assert self._comm is not None + super()._reset( + rapidsmpf_options=rapidsmpf_options, + executor_options=executor_options, + engine_options=engine_options, + ) + executor_options = executor_options or {} + engine_options = engine_options or {} + rapidsmpf_options = resolve_rapidsmpf_options(rapidsmpf_options) + + # Collective: synchronize all ranks before tearing down the Context. + if self._comm.nranks > 1: + barrier(self._comm) + # Same-thread shutdown, _reset runs on the thread that built the + # Context (the test driver's main thread). The per-engine RMM + # resource is kept alive across resets, see :meth:`_cleanup_ctx`. + self._ctx.shutdown() + self._ctx = Context.from_options(self._comm.logger, self._mr, rapidsmpf_options) + + # Re-run ``StreamingEngine.__init__`` on the existing instance to + # reconfigure the polars ``GPUEngine`` layer (``self.config``, + # ``self.device``, etc.) with the new options. Pass the existing + # ``self._exit_stack`` so any registered callbacks (notably + # ``_cleanup_ctx`` and ``set_memory_resource``) survive. 
+ StreamingEngine.__init__( + self, + nranks=self._comm.nranks, + executor_options={ + **executor_options, + "runtime": "rapidsmpf", + "cluster": "spmd", + "spmd_context": SPMDContext( + comm=self._comm, + context=self._ctx, + py_executor=self._py_executor, + ), + }, + engine_options={ + **engine_options, + "memory_resource": self._ctx.br().device_mr, + }, + exit_stack=self._exit_stack, + ) + @property def rank(self) -> int: """ @@ -536,9 +609,14 @@ def shutdown(self) -> None: """ if self._ctx is None: return # already shut down + + # Order matters: ``super().shutdown()`` closes ``self._exit_stack``, + # which invokes ``self._cleanup_ctx``. That requires ``self._ctx`` to + # still be set so the rapidsmpf Context can be shut down correctly. + # Clear the references only after shutdown completes. + super().shutdown() self._comm = None self._ctx = None - super().shutdown() def _run(self, func: Callable[..., T], *args: Any, **kwargs: Any) -> list[T]: data = json.dumps(func(*args, **kwargs)).encode() diff --git a/python/cudf_polars/cudf_polars/testing/engine_utils.py b/python/cudf_polars/cudf_polars/testing/engine_utils.py index ec216dc6d88..c36bcf2ed27 100644 --- a/python/cudf_polars/cudf_polars/testing/engine_utils.py +++ b/python/cudf_polars/cudf_polars/testing/engine_utils.py @@ -10,7 +10,7 @@ from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: - from rapidsmpf.communicator.communicator import Communicator + from collections.abc import Mapping import polars as pl @@ -112,39 +112,49 @@ def create_streaming_options( def build_streaming_engine( param: EngineFixtureParam, - spmd_comm: Communicator, + engines: Mapping[str, StreamingEngine], options: StreamingOptions | None = None, ) -> StreamingEngine: """ - Build a :class:`StreamingEngine` from an engine fixture parameter. + Return ``engines``'s entry for ``param``, ``_reset``-ed. 
+ + ``engines`` must already contain a slot for ``param.engine_name`` — + seeded by the ``streaming_engines`` session-scoped fixture. The + fixture owns mutation; this function only reads and ``_reset``-s. Parameters ---------- param Decoded engine fixture parameter describing the backend and block size mode. - spmd_comm - Communicator used when constructing an :class:`SPMDEngine`. + engines + Streaming-engine collection keyed by backend name. Provided by + the ``streaming_engines`` test fixture. options Optional streaming options to merge on top of the baseline selected by ``param.blocksize_mode``. Returns ------- - A streaming engine matching ``param``. - """ - from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine + The shared :class:`StreamingEngine`, ``_reset`` to the requested options. + Raises + ------ + RuntimeError + If ``engines`` has no slot for ``param.engine_name``. + """ streaming_options = create_streaming_options(param.blocksize_mode, options) - match param.engine_name: - case "spmd": - return SPMDEngine( - comm=spmd_comm, - rapidsmpf_options=streaming_options.to_rapidsmpf_options(), - executor_options=streaming_options.to_executor_options(), - engine_options=streaming_options.to_engine_options(), - ) - case _: # pragma: no cover - raise AssertionError(f"Unknown streaming backend: {param.engine_name!r}") + engine = engines.get(param.engine_name) + if engine is None: # pragma: no cover + raise RuntimeError( + f"No streaming engine for {param.engine_name!r}. The corresponding " + "session-scoped fixture must populate the collection before tests run." 
+ ) + engine._reset( + rapidsmpf_options=streaming_options.to_rapidsmpf_options(), + executor_options=streaming_options.to_executor_options(), + engine_options=streaming_options.to_engine_options(), + ) + return engine def get_blocksize_mode(obj: pl.GPUEngine) -> Literal["medium", "small"]: diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py index 7ad45c06605..7f00684638f 100644 --- a/python/cudf_polars/tests/conftest.py +++ b/python/cudf_polars/tests/conftest.py @@ -18,12 +18,18 @@ ) if TYPE_CHECKING: - from collections.abc import Callable, Generator - - from rapidsmpf.communicator.communicator import Communicator + from collections.abc import Callable, Generator, Mapping + from typing import TypeAlias from cudf_polars.experimental.rapidsmpf.frontend.core import StreamingEngine from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions + from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine + + # Read-only view over the per-backend streaming engines owned by the + # ``streaming_engines`` session fixture. Only that fixture mutates the + # underlying dict; consumers (``spmd_engine``, ``streaming_engine_factory``, + # ``engine``) only look up by backend name. + StreamingEngines: TypeAlias = Mapping[str, StreamingEngine] @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session") @@ -66,12 +72,12 @@ def _skip_unless_spmd(request: pytest.FixtureRequest) -> None: @pytest.fixture(scope="session") -def spmd_comm() -> Communicator: - """Session-scoped communicator — bootstrapped once and shared across all tests. +def streaming_engines() -> Generator[StreamingEngines, None, None]: + """Return a session-scoped mapping of engine name to engine instance. 
- Sharing a single communicator avoids the file-based bootstrap race that can - cause hangs when ``create_ucxx_comm()`` is called repeatedly in the same - ``rrun`` session (stale barrier files / stale ``ucxx_root_address`` KV entry). + The returned :class:`StreamingEngines` is a dict that maps each engine + name to a single shared engine instance, which is reused across the entire + test session. """ pytest.importorskip("rapidsmpf") from rapidsmpf import bootstrap @@ -79,12 +85,36 @@ def spmd_comm() -> Communicator: from rapidsmpf.config import Options, get_environment_variables from rapidsmpf.progress_thread import ProgressThread + from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine + if bootstrap.is_running_with_rrun(): - return bootstrap.create_ucxx_comm( + comm = bootstrap.create_ucxx_comm( progress_thread=ProgressThread(), type=bootstrap.BackendType.AUTO, ) - return single_communicator(Options(get_environment_variables()), ProgressThread()) + else: + comm = single_communicator( + Options(get_environment_variables()), ProgressThread() + ) + + engines: dict[str, StreamingEngine] = {"spmd": SPMDEngine(comm=comm)} + try: + yield engines + finally: + while engines: + _, engine = engines.popitem() + engine.shutdown() + + +@pytest.fixture +def spmd_engine(streaming_engines: StreamingEngines) -> SPMDEngine: + """Return the shared :class:`SPMDEngine` reset to default options.""" + from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine + + engine = streaming_engines["spmd"] + assert isinstance(engine, SPMDEngine) + engine._reset() + return engine @pytest.fixture(params=STREAMING_ENGINE_FIXTURE_PARAMS) @@ -102,38 +132,29 @@ def _all_engine_param(request: pytest.FixtureRequest) -> EngineFixtureParam: @pytest.fixture def streaming_engine_factory( _streaming_engine_param: EngineFixtureParam, - spmd_comm: Communicator, -) -> Generator[Callable[..., StreamingEngine], None, None]: + streaming_engines: StreamingEngines, +) -> Callable[..., 
StreamingEngine]: """ - Yield a factory that constructs :class:`StreamingEngine` instances for tests. - - The fixture is parametrized over :data:`STREAMING_ENGINE_FIXTURE_PARAMS`. - Created engines are tracked and automatically shut down after the test. + Return a factory that yields a shared :class:`StreamingEngine`. Parameters ---------- _streaming_engine_param Parametrized engine descriptor controlling backend and block size mode. - spmd_comm - Communicator used when constructing SPMD-based engines. - - Yields - ------ - Factory function that creates :class:`StreamingEngine` instances. The - factory accepts optional :class:`StreamingOptions`, which are merged on - top of the parametrized blocksize baseline. + streaming_engines + Session-scoped engine collection to look up the shared engine in. + + Returns + ------- + Factory function that returns the shared :class:`StreamingEngine`. """ - engines: list[StreamingEngine] = [] def factory(options: StreamingOptions | None = None) -> StreamingEngine: - engine = build_streaming_engine(_streaming_engine_param, spmd_comm, options) - engines.append(engine) - return engine - - yield factory + return build_streaming_engine( + _streaming_engine_param, streaming_engines, options + ) - for engine in reversed(engines): - engine.shutdown() + return factory @pytest.fixture @@ -164,9 +185,9 @@ def streaming_engine( def engine( request: pytest.FixtureRequest, _all_engine_param: EngineFixtureParam, -) -> Generator[pl.GPUEngine, None, None]: +) -> pl.GPUEngine: """ - Yield a :class:`polars.GPUEngine` for each engine variant under test. + Return a :class:`polars.GPUEngine` for each engine variant under test. Parameters ---------- @@ -176,8 +197,8 @@ def engine( Parametrized engine descriptor covering both in-memory and streaming variants. - Yields - ------ + Returns + ------- Engine instance matching the parametrized variant. Notes @@ -186,15 +207,10 @@ def engine( :func:`streaming_engine` fixture instead. 
""" if _all_engine_param.engine_name == "in-memory": - yield pl.GPUEngine(executor="in-memory", raise_on_fail=True) - return + return pl.GPUEngine(executor="in-memory", raise_on_fail=True) - spmd_comm: Communicator = request.getfixturevalue("spmd_comm") - engine = build_streaming_engine(_all_engine_param, spmd_comm) - try: - yield engine - finally: - engine.shutdown() + engines: StreamingEngines = request.getfixturevalue("streaming_engines") + return build_streaming_engine(_all_engine_param, engines) @pytest.fixture diff --git a/python/cudf_polars/tests/experimental/test_all_gather_host_data.py b/python/cudf_polars/tests/experimental/test_all_gather_host_data.py index aad7b341676..8f09a82c4bd 100644 --- a/python/cudf_polars/tests/experimental/test_all_gather_host_data.py +++ b/python/cudf_polars/tests/experimental/test_all_gather_host_data.py @@ -14,7 +14,6 @@ all_gather_host_data, ) from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions -from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine pytestmark = pytest.mark.spmd @@ -36,15 +35,14 @@ def _struct(rank: int) -> bytes: @pytest.mark.parametrize("make_data", [_empty, _text, _bytearray, _struct]) -def test_all_gather_host_data(spmd_comm, make_data) -> None: +def test_all_gather_host_data(spmd_engine, make_data) -> None: """Each rank sends rank-specific data; results are correct and ordered.""" - with SPMDEngine(comm=spmd_comm) as spmd_engine: - comm = spmd_engine.comm - br = spmd_engine.context.br() - result = all_gather_host_data(comm, br, op_id=0, data=make_data(comm.rank)) - assert len(result) == comm.nranks - for i, item in enumerate(result): - assert item == bytes(make_data(i)) + comm = spmd_engine.comm + br = spmd_engine.context.br() + result = all_gather_host_data(comm, br, op_id=0, data=make_data(comm.rank)) + assert len(result) == comm.nranks + for i, item in enumerate(result): + assert item == bytes(make_data(i)) def test_gather_cluster_info(streaming_engine) 
-> None: diff --git a/python/cudf_polars/tests/experimental/test_allgather.py b/python/cudf_polars/tests/experimental/test_allgather.py index 514276c6647..52c353044eb 100644 --- a/python/cudf_polars/tests/experimental/test_allgather.py +++ b/python/cudf_polars/tests/experimental/test_allgather.py @@ -13,7 +13,6 @@ import pylibcudf as plc from cudf_polars.experimental.rapidsmpf.collectives.allgather import AllGatherManager -from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine from cudf_polars.experimental.rapidsmpf.utils import allgather_reduce @@ -53,9 +52,8 @@ async def _test_allgather(engine) -> None: assert col.type().id().value == plc.types.TypeId.INT32.value -def test_allgather(spmd_comm) -> None: - with SPMDEngine(comm=spmd_comm) as engine: - asyncio.run(_test_allgather(engine)) +def test_allgather(spmd_engine) -> None: + asyncio.run(_test_allgather(spmd_engine)) async def _test_allgather_reduce(engine) -> None: @@ -72,6 +70,5 @@ async def _test_allgather_reduce(engine) -> None: assert results == (10, 20, 30) # Single rank, so sums are just the local values -def test_allgather_reduce(spmd_comm) -> None: - with SPMDEngine(comm=spmd_comm) as engine: - asyncio.run(_test_allgather_reduce(engine)) +def test_allgather_reduce(spmd_engine) -> None: + asyncio.run(_test_allgather_reduce(spmd_engine)) diff --git a/python/cudf_polars/tests/experimental/test_dask.py b/python/cudf_polars/tests/experimental/test_dask.py index d923edd37cf..5ccdde864ef 100644 --- a/python/cudf_polars/tests/experimental/test_dask.py +++ b/python/cudf_polars/tests/experimental/test_dask.py @@ -153,3 +153,97 @@ def test_empty_dataframe(engine: DaskEngine) -> None: def test_run(engine: DaskEngine) -> None: result = engine._run(os.getpid) assert len(set(result)) == engine.nranks + + +@pytest.fixture(scope="module") +def reset_engine() -> Iterator[DaskEngine]: + """Module-scoped engine for reset tests — independent of ``engine``. 
+ + These tests exercise :meth:`DaskEngine._reset` (which mutates the + engine in-place). A dedicated fixture keeps those mutations from + leaking into the other tests. + """ + with DaskEngine( + executor_options={"max_rows_per_partition": 10}, + ) as e: + yield e + + +def test_reset_keeps_workers_alive(reset_engine: DaskEngine) -> None: + """``_reset`` must not respawn dask workers.""" + workers_before = sorted( + reset_engine._dask_ctx.client.scheduler_info(n_workers=-1)["workers"] + ) + pids_before = sorted(reset_engine._run(os.getpid)) + + reset_engine._reset(executor_options={"max_rows_per_partition": 7}) + + workers_after = sorted( + reset_engine._dask_ctx.client.scheduler_info(n_workers=-1)["workers"] + ) + pids_after = sorted(reset_engine._run(os.getpid)) + + # Same worker addresses … + assert workers_before == workers_after + # … and the workers are running in the same OS processes. + assert pids_before == pids_after + + +def test_reset_updates_executor_options(reset_engine: DaskEngine) -> None: + """``_reset`` updates the polars-layer config to the new options.""" + reset_engine._reset(executor_options={"max_rows_per_partition": 42}) + + opts = reset_engine.config["executor_options"] + assert opts["max_rows_per_partition"] == 42 + # Reserved keys are still injected by ``_reset``. 
+ assert opts["runtime"] == "rapidsmpf" + assert opts["cluster"] == "dask" + assert isinstance(opts["dask_context"], DaskContext) + + +def test_reset_collects_after_options_change(reset_engine: DaskEngine) -> None: + """The engine still drives a real query after ``_reset``.""" + reset_engine._reset(executor_options={"max_rows_per_partition": 3}) + assert_gpu_result_equal( + pl.LazyFrame({"a": [1, 2, 3, 4, 5]}), + engine=reset_engine, + check_row_order=False, + ) + + +def test_reset_after_shutdown_raises() -> None: + """``shutdown`` is idempotent; ``_reset`` after shutdown raises every time.""" + engine = DaskEngine(executor_options={"max_rows_per_partition": 10}) + engine.shutdown() + engine.shutdown() # idempotent + with pytest.raises(RuntimeError, match="shut-down"): + engine._reset() + with pytest.raises(RuntimeError, match="shut-down"): + engine._reset() # still raises on a second attempt + engine.shutdown() # still safe after a failed _reset + + +def test_reset_rejects_construction_time_executor_options( + reset_engine: DaskEngine, +) -> None: + """``_reset`` rejects ``executor_options`` keys read at worker setup.""" + with pytest.raises(ValueError, match="num_py_executors"): + reset_engine._reset(executor_options={"num_py_executors": 4}) + + +def test_reset_rejects_construction_time_engine_options( + reset_engine: DaskEngine, +) -> None: + """``_reset`` rejects ``engine_options`` keys read at worker setup.""" + from cudf_polars.experimental.rapidsmpf.frontend.hardware_binding import ( + HardwareBindingPolicy, + ) + + with pytest.raises(ValueError, match="hardware_binding"): + reset_engine._reset( + engine_options={ + "hardware_binding": HardwareBindingPolicy(enabled=False), + }, + ) + with pytest.raises(ValueError, match="memory_resource_config"): + reset_engine._reset(engine_options={"memory_resource_config": None}) diff --git a/python/cudf_polars/tests/experimental/test_io_multirank.py b/python/cudf_polars/tests/experimental/test_io_multirank.py index 
631f12fd85c..2208cc67316 100644 --- a/python/cudf_polars/tests/experimental/test_io_multirank.py +++ b/python/cudf_polars/tests/experimental/test_io_multirank.py @@ -19,8 +19,6 @@ from collections.abc import Iterator from pathlib import Path - from rapidsmpf.communicator.communicator import Communicator - from cudf_polars.experimental.rapidsmpf.frontend.core import StreamingEngine # Runs the spmd variant even under rrun with nranks > 1. The ray/dask @@ -44,7 +42,7 @@ def df() -> pl.LazyFrame: @pytest.fixture(params=["spmd", "ray", "dask"]) def engine( request: pytest.FixtureRequest, - spmd_comm: Communicator, + spmd_engine: SPMDEngine, ) -> Iterator[StreamingEngine]: """Yield each supported streaming engine.""" backend = request.param @@ -52,7 +50,7 @@ def engine( if backend == "spmd": with SPMDEngine( - comm=spmd_comm, + comm=spmd_engine.comm, executor_options=executor_options, ) as eng: yield eng diff --git a/python/cudf_polars/tests/experimental/test_ray.py b/python/cudf_polars/tests/experimental/test_ray.py index 7365be733b3..ded4903c594 100644 --- a/python/cudf_polars/tests/experimental/test_ray.py +++ b/python/cudf_polars/tests/experimental/test_ray.py @@ -275,7 +275,7 @@ def test_reset_collects_after_options_change(reset_engine: RayEngine) -> None: def test_reset_after_shutdown_raises() -> None: - """``_reset`` after ``shutdown`` raises ``RuntimeError``.""" + """``shutdown`` is idempotent; ``_reset`` after shutdown raises every time.""" engine = RayEngine( executor_options={"max_rows_per_partition": 10}, engine_options={"allow_gpu_sharing": True}, @@ -283,8 +283,12 @@ def test_reset_after_shutdown_raises() -> None: ray_init_options={"include_dashboard": False}, ) engine.shutdown() + engine.shutdown() # idempotent with pytest.raises(RuntimeError, match="shut-down"): engine._reset() + with pytest.raises(RuntimeError, match="shut-down"): + engine._reset() # still raises on a second attempt + engine.shutdown() # still safe after a failed _reset def 
test_reset_rejects_construction_time_executor_options( diff --git a/python/cudf_polars/tests/experimental/test_sink.py b/python/cudf_polars/tests/experimental/test_sink.py index 9b0573d2cb4..df68b7c199a 100644 --- a/python/cudf_polars/tests/experimental/test_sink.py +++ b/python/cudf_polars/tests/experimental/test_sink.py @@ -92,7 +92,7 @@ def test_sink_parquet_directory( assert len(list(check_path.iterdir())) == expected_file_count -def test_sink_parquet_raises_spmd(spmd_comm): +def test_sink_parquet_raises_spmd(spmd_engine): from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine with ( @@ -100,7 +100,7 @@ def test_sink_parquet_raises_spmd(spmd_comm): ValueError, match="The spmd cluster requires sink_to_directory=True" ), SPMDEngine( - comm=spmd_comm, executor_options={"sink_to_directory": False} + comm=spmd_engine.comm, executor_options={"sink_to_directory": False} ) as engine, ): ConfigOptions.from_polars_engine(engine) diff --git a/python/cudf_polars/tests/experimental/test_spilling.py b/python/cudf_polars/tests/experimental/test_spilling.py index 799d19402e6..6aa11801132 100644 --- a/python/cudf_polars/tests/experimental/test_spilling.py +++ b/python/cudf_polars/tests/experimental/test_spilling.py @@ -9,7 +9,6 @@ import numpy as np import pytest -from rapidsmpf.config import Options from rapidsmpf.memory.buffer import MemoryType from rapidsmpf.memory.pinned_memory_resource import is_pinned_memory_resources_supported from rapidsmpf.streaming.core.message import Message @@ -18,7 +17,7 @@ import pylibcudf as plc -from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine +from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions from cudf_polars.experimental.rapidsmpf.utils import ( make_spill_function, ) @@ -51,109 +50,104 @@ def create_test_table(nbytes: int, stream: Stream) -> plc.Table: ], ) def test_make_spill_function( - spmd_comm, + streaming_engine_factory, *, pinned_memory: bool, spilled_host_mem_type: 
MemoryType, ) -> None: """Test that spilling prioritizes longest queues and newest messages.""" - with SPMDEngine( - comm=spmd_comm, - rapidsmpf_options=Options({"pinned_memory": str(pinned_memory).lower()}), - ) as spmd_engine: - context = spmd_engine.context - - if spilled_host_mem_type == MemoryType.PINNED_HOST: - assert spmd_engine.context.br().pinned_mr is not None - other_host_mem_type = MemoryType.HOST - else: - assert spmd_engine.context.br().pinned_mr is None - other_host_mem_type = MemoryType.PINNED_HOST - - # Create 3 spillable message containers simulating fanout buffers - # Buffer 0: Fast consumer (2 messages) - # Buffer 1: Slow consumer (5 messages) <- should spill from here first - # Buffer 2: Medium consumer (3 messages) - buffers = [SpillableMessages(context.br()) for _ in range(3)] - messages_per_buffer = [2, 5, 3] - - # Track message IDs for each buffer - message_ids: dict[int, list[int]] = {} - - # Populate buffers with messages - stream = context.get_stream_from_pool() - for buffer_idx, (sm, count) in enumerate( - zip(buffers, messages_per_buffer, strict=False) - ): - message_ids[buffer_idx] = [] - for msg_idx in range(count): - # Create 1MB messages - table = create_test_table(1024 * 1024, stream) - chunk = TableChunk.from_pylibcudf_table( - table, stream, exclusive_view=True, br=context.br() - ) - msg = Message(msg_idx, chunk) - mid = sm.insert(msg) - message_ids[buffer_idx].append(mid) - - # Register spill function - spill_func = make_spill_function(buffers, context) - func_id = context.br().spill_manager.add_spill_function(spill_func, priority=0) - - try: - # Manually trigger spilling of 3MB - # Expected: Buffer 1 (longest) should spill newest messages first - amount_to_spill = 3 * 1024 * 1024 - actual_spilled = context.br().spill_manager.spill(amount_to_spill) - - # Allow some tolerance - assert actual_spilled >= amount_to_spill * 0.95 - - # Verify Buffer 1 (longest queue): newest 3 messages should be spilled - buffer_1_descs = 
buffers[1].get_content_descriptions() - for i in range(3, 5): # Messages 3, 4 (newest) - mid = message_ids[1][i] - desc = buffer_1_descs[mid] - # Should be in HOST memory (spilled) - assert desc.content_sizes[spilled_host_mem_type] > 0 - assert desc.content_sizes[other_host_mem_type] == 0 - assert desc.content_sizes[MemoryType.DEVICE] == 0 - - # Buffer 1: oldest messages should still be in device - for i in range(2): # Messages 0, 1 (oldest) - mid = message_ids[1][i] - desc = buffer_1_descs[mid] - # Should still be in DEVICE memory - assert desc.content_sizes[MemoryType.DEVICE] > 0 - assert desc.content_sizes[spilled_host_mem_type] == 0 - assert desc.content_sizes[other_host_mem_type] == 0 - - # Buffer 0 (shortest queue): all messages should still be on device - buffer_0_descs = buffers[0].get_content_descriptions() - for mid in message_ids[0]: - desc = buffer_0_descs[mid] - assert desc.content_sizes[MemoryType.DEVICE] > 0 - assert desc.content_sizes[spilled_host_mem_type] == 0 - assert desc.content_sizes[other_host_mem_type] == 0 - - # Verify we can extract and make available a spilled message - spilled_mid = message_ids[1][4] # Newest message from longest queue - spilled_msg = buffers[1].extract(mid=spilled_mid) - - chunk = TableChunk.from_message(spilled_msg, br=context.br()) - assert not chunk.is_available() # Should be on host - - # Make it available should bring it back to device - cost = chunk.make_available_cost() - assert cost > 0 - res, _ = context.br().reserve( - MemoryType.DEVICE, cost, allow_overbooking=True + engine = streaming_engine_factory(StreamingOptions(pinned_memory=pinned_memory)) + context = engine.context + + if spilled_host_mem_type == MemoryType.PINNED_HOST: + assert engine.context.br().pinned_mr is not None + other_host_mem_type = MemoryType.HOST + else: + assert engine.context.br().pinned_mr is None + other_host_mem_type = MemoryType.PINNED_HOST + + # Create 3 spillable message containers simulating fanout buffers + # Buffer 0: Fast 
consumer (2 messages) + # Buffer 1: Slow consumer (5 messages) <- should spill from here first + # Buffer 2: Medium consumer (3 messages) + buffers = [SpillableMessages(context.br()) for _ in range(3)] + messages_per_buffer = [2, 5, 3] + + # Track message IDs for each buffer + message_ids: dict[int, list[int]] = {} + + # Populate buffers with messages + stream = context.get_stream_from_pool() + for buffer_idx, (sm, count) in enumerate( + zip(buffers, messages_per_buffer, strict=False) + ): + message_ids[buffer_idx] = [] + for msg_idx in range(count): + # Create 1MB messages + table = create_test_table(1024 * 1024, stream) + chunk = TableChunk.from_pylibcudf_table( + table, stream, exclusive_view=True, br=context.br() ) - chunk_available = chunk.make_available(res) - - assert chunk_available.is_available() - # Verify we got a valid table back - assert chunk_available.table_view().num_rows() > 0 - - finally: - context.br().spill_manager.remove_spill_function(func_id) + msg = Message(msg_idx, chunk) + mid = sm.insert(msg) + message_ids[buffer_idx].append(mid) + + # Register spill function + spill_func = make_spill_function(buffers, context) + func_id = context.br().spill_manager.add_spill_function(spill_func, priority=0) + + try: + # Manually trigger spilling of 3MB + # Expected: Buffer 1 (longest) should spill newest messages first + amount_to_spill = 3 * 1024 * 1024 + actual_spilled = context.br().spill_manager.spill(amount_to_spill) + + # Allow some tolerance + assert actual_spilled >= amount_to_spill * 0.95 + + # Verify Buffer 1 (longest queue): newest 3 messages should be spilled + buffer_1_descs = buffers[1].get_content_descriptions() + for i in range(3, 5): # Messages 3, 4 (newest) + mid = message_ids[1][i] + desc = buffer_1_descs[mid] + # Should be in HOST memory (spilled) + assert desc.content_sizes[spilled_host_mem_type] > 0 + assert desc.content_sizes[other_host_mem_type] == 0 + assert desc.content_sizes[MemoryType.DEVICE] == 0 + + # Buffer 1: oldest 
messages should still be in device + for i in range(2): # Messages 0, 1 (oldest) + mid = message_ids[1][i] + desc = buffer_1_descs[mid] + # Should still be in DEVICE memory + assert desc.content_sizes[MemoryType.DEVICE] > 0 + assert desc.content_sizes[spilled_host_mem_type] == 0 + assert desc.content_sizes[other_host_mem_type] == 0 + + # Buffer 0 (shortest queue): all messages should still be on device + buffer_0_descs = buffers[0].get_content_descriptions() + for mid in message_ids[0]: + desc = buffer_0_descs[mid] + assert desc.content_sizes[MemoryType.DEVICE] > 0 + assert desc.content_sizes[spilled_host_mem_type] == 0 + assert desc.content_sizes[other_host_mem_type] == 0 + + # Verify we can extract and make available a spilled message + spilled_mid = message_ids[1][4] # Newest message from longest queue + spilled_msg = buffers[1].extract(mid=spilled_mid) + + chunk = TableChunk.from_message(spilled_msg, br=context.br()) + assert not chunk.is_available() # Should be on host + + # Make it available should bring it back to device + cost = chunk.make_available_cost() + assert cost > 0 + res, _ = context.br().reserve(MemoryType.DEVICE, cost, allow_overbooking=True) + chunk_available = chunk.make_available(res) + + assert chunk_available.is_available() + # Verify we got a valid table back + assert chunk_available.table_view().num_rows() > 0 + + finally: + context.br().spill_manager.remove_spill_function(func_id) diff --git a/python/cudf_polars/tests/experimental/test_spmd.py b/python/cudf_polars/tests/experimental/test_spmd.py index a1970c8e92f..9fef0e00350 100644 --- a/python/cudf_polars/tests/experimental/test_spmd.py +++ b/python/cudf_polars/tests/experimental/test_spmd.py @@ -30,12 +30,22 @@ pytestmark = pytest.mark.spmd -def test_yields_context_and_engine(spmd_comm: Communicator) -> None: +@pytest.fixture +def comm(spmd_engine: SPMDEngine) -> Communicator: + """Communicator from the shared :class:`SPMDEngine` for local construction. 
+ + Most tests in this module need to construct their own + :class:`SPMDEngine` to exercise lifecycle, construction-time + options, MR-state semantics, or :meth:`SPMDEngine._reset`. + """ + return spmd_engine.comm + + +def test_yields_context_and_engine(spmd_engine: SPMDEngine) -> None: """SPMDEngine has comm and context properties.""" - with SPMDEngine(comm=spmd_comm) as engine: - assert engine.comm is not None - assert engine.context is not None - assert isinstance(engine, pl.GPUEngine) + assert spmd_engine.comm is not None + assert spmd_engine.context is not None + assert isinstance(spmd_engine, pl.GPUEngine) def test_from_options() -> None: @@ -74,31 +84,29 @@ def test_engine_options_reserved_keys() -> None: pass -def test_engine_options_parquet_options(spmd_comm: Communicator) -> None: +def test_engine_options_parquet_options(comm: Communicator) -> None: """engine_options forwards parquet_options to GPUEngine without error.""" - with SPMDEngine(comm=spmd_comm, engine_options={"parquet_options": {}}) as engine: + with SPMDEngine(comm=comm, engine_options={"parquet_options": {}}) as engine: assert isinstance(engine, pl.GPUEngine) -def test_scan(spmd_comm: Communicator) -> None: +def test_scan(spmd_engine: SPMDEngine) -> None: """Each rank scans its own single-row LazyFrame and gets that row back.""" - with SPMDEngine(comm=spmd_comm) as engine: - lf = pl.LazyFrame({"a": [engine.rank], "b": [engine.rank * 10]}) - result = lf.collect(engine=engine) - assert result.shape == (1, 2) - assert result["a"].to_list() == [engine.rank] - assert result["b"].to_list() == [engine.rank * 10] + lf = pl.LazyFrame({"a": [spmd_engine.rank], "b": [spmd_engine.rank * 10]}) + result = lf.collect(engine=spmd_engine) + assert result.shape == (1, 2) + assert result["a"].to_list() == [spmd_engine.rank] + assert result["b"].to_list() == [spmd_engine.rank * 10] -def test_basic_query(spmd_comm: Communicator) -> None: +def test_basic_query(spmd_engine: SPMDEngine) -> None: """A simple 
in-memory LazyFrame can be collected.""" - with SPMDEngine(comm=spmd_comm) as engine: - result = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).collect(engine=engine) + result = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}).collect(engine=spmd_engine) assert result.shape == (3, 2) assert result["a"].to_list() == [1, 2, 3] -def test_collect_then_lazy_equivalent(spmd_comm: Communicator) -> None: +def test_collect_then_lazy_equivalent(spmd_engine: SPMDEngine) -> None: """collect().lazy() preserves SPMD semantics: an intermediate materialize is a no-op. In SPMD mode a DataFrame is always rank-local. When it is wrapped back @@ -106,111 +114,105 @@ def test_collect_then_lazy_equivalent(spmd_comm: Communicator) -> None: re-slicing it across ranks. So ``lf.collect().lazy().op.collect()`` must produce the same result as ``lf.op.collect()``. """ - with SPMDEngine(comm=spmd_comm) as engine: - lf = pl.LazyFrame( - {"a": [engine.rank, engine.rank + 1, engine.rank + 2], "b": [0, 1, 2]} - ) + rank = spmd_engine.rank + lf = pl.LazyFrame({"a": [rank, rank + 1, rank + 2], "b": [0, 1, 2]}) - # One-step - one_step = lf.filter(pl.col("b") >= 1).collect(engine=engine) + # One-step + one_step = lf.filter(pl.col("b") >= 1).collect(engine=spmd_engine) - # Two-step: materialize then re-wrap - intermediate = lf.collect(engine=engine) - two_step = intermediate.lazy().filter(pl.col("b") >= 1).collect(engine=engine) + # Two-step: materialize then re-wrap + intermediate = lf.collect(engine=spmd_engine) + two_step = intermediate.lazy().filter(pl.col("b") >= 1).collect(engine=spmd_engine) assert one_step.sort("a").equals(two_step.sort("a")) -def test_group_by(spmd_comm: Communicator) -> None: +def test_group_by(spmd_engine: SPMDEngine) -> None: """Group-by on rank-local data, then allgather to verify the global result.""" - with SPMDEngine(comm=spmd_comm) as engine: - lf = pl.LazyFrame({"a": [engine.rank], "b": [engine.rank * 10]}) - local_result = 
lf.group_by("a").agg(pl.col("b").sum()).collect(engine=engine) - with reserve_op_id() as op_id: - global_result = allgather_polars_dataframe( - engine=engine, local_df=local_result, op_id=op_id - ) - assert global_result.shape == (engine.nranks, 2) - assert global_result.sort("a")["a"].to_list() == list(range(engine.nranks)) - assert global_result.sort("a")["b"].to_list() == [ - r * 10 for r in range(engine.nranks) - ] + lf = pl.LazyFrame({"a": [spmd_engine.rank], "b": [spmd_engine.rank * 10]}) + local_result = lf.group_by("a").agg(pl.col("b").sum()).collect(engine=spmd_engine) + with reserve_op_id() as op_id: + global_result = allgather_polars_dataframe( + engine=spmd_engine, local_df=local_result, op_id=op_id + ) + assert global_result.shape == (spmd_engine.nranks, 2) + assert global_result.sort("a")["a"].to_list() == list(range(spmd_engine.nranks)) + assert global_result.sort("a")["b"].to_list() == [ + r * 10 for r in range(spmd_engine.nranks) + ] -def test_allgather_polars_dataframe(spmd_comm: Communicator) -> None: +def test_allgather_polars_dataframe(spmd_engine: SPMDEngine) -> None: """allgather_polars_dataframe collects every rank's contribution in rank order.""" - with SPMDEngine(comm=spmd_comm) as engine: - local = pl.DataFrame({"rank": [engine.rank], "val": [engine.rank * 2]}) - with reserve_op_id() as op_id: - result = allgather_polars_dataframe( - engine=engine, local_df=local, op_id=op_id - ) - assert result.shape == (engine.nranks, 2) - assert result["rank"].to_list() == list(range(engine.nranks)) - assert result["val"].to_list() == [r * 2 for r in range(engine.nranks)] + local = pl.DataFrame({"rank": [spmd_engine.rank], "val": [spmd_engine.rank * 2]}) + with reserve_op_id() as op_id: + result = allgather_polars_dataframe( + engine=spmd_engine, local_df=local, op_id=op_id + ) + assert result.shape == (spmd_engine.nranks, 2) + assert result["rank"].to_list() == list(range(spmd_engine.nranks)) + assert result["val"].to_list() == [r * 2 for r in 
range(spmd_engine.nranks)] -def test_num_py_executors(spmd_comm: Communicator) -> None: +def test_num_py_executors(comm: Communicator) -> None: """executor_options forwards num_py_executors to the thread pool.""" with SPMDEngine( - comm=spmd_comm, + comm=comm, executor_options={"num_py_executors": 2}, ) as engine: result = pl.LazyFrame({"a": [1, 2, 3]}).collect(engine=engine) assert result.shape == (3, 1) -def test_allgather_polars_dataframe_empty(spmd_comm: Communicator) -> None: +def test_allgather_polars_dataframe_empty(spmd_engine: SPMDEngine) -> None: """allgather handles an empty (zero-row) local DataFrame on every rank.""" - with SPMDEngine(comm=spmd_comm) as engine: - local = pl.DataFrame( - {"a": pl.Series([], dtype=pl.Int32), "b": pl.Series([], dtype=pl.Float64)} + local = pl.DataFrame( + {"a": pl.Series([], dtype=pl.Int32), "b": pl.Series([], dtype=pl.Float64)} + ) + with reserve_op_id() as op_id: + result = allgather_polars_dataframe( + engine=spmd_engine, local_df=local, op_id=op_id ) - with reserve_op_id() as op_id: - result = allgather_polars_dataframe( - engine=engine, local_df=local, op_id=op_id - ) assert result.shape == (0, 2) assert result.columns == ["a", "b"] assert result.dtypes == [pl.Int32, pl.Float64] -def test_mr_wrapped_as_current_inside_context(spmd_comm: Communicator) -> None: +def test_mr_wrapped_as_current_inside_context(comm: Communicator) -> None: """Inside SPMDEngine the current device resource is RmmResourceAdaptor.""" - with SPMDEngine(comm=spmd_comm): + with SPMDEngine(comm=comm): assert isinstance(rmm.mr.get_current_device_resource(), RmmResourceAdaptor) -def test_mr_restored_after_context(spmd_comm: Communicator) -> None: +def test_mr_restored_after_context(comm: Communicator) -> None: """After SPMDEngine exits the original device resource is restored.""" original = rmm.mr.get_current_device_resource() - with SPMDEngine(comm=spmd_comm): + with SPMDEngine(comm=comm): pass assert rmm.mr.get_current_device_resource() is original 
-def test_allgather_polars_dataframe_multi_column(spmd_comm: Communicator) -> None: +def test_allgather_polars_dataframe_multi_column(spmd_engine: SPMDEngine) -> None: """allgather preserves column names, count, and dtypes for multi-column DataFrames.""" - with SPMDEngine(comm=spmd_comm) as engine: - local = pl.DataFrame( - { - "rank": [engine.rank], - "x": [float(engine.rank)], - "label": [f"r{engine.rank}"], - } + local = pl.DataFrame( + { + "rank": [spmd_engine.rank], + "x": [float(spmd_engine.rank)], + "label": [f"r{spmd_engine.rank}"], + } + ) + with reserve_op_id() as op_id: + result = allgather_polars_dataframe( + engine=spmd_engine, local_df=local, op_id=op_id ) - with reserve_op_id() as op_id: - result = allgather_polars_dataframe( - engine=engine, local_df=local, op_id=op_id - ) - assert result.shape == (engine.nranks, 3) - assert result.columns == ["rank", "x", "label"] - sorted_result = result.sort("rank") - assert sorted_result["rank"].to_list() == list(range(engine.nranks)) - assert sorted_result["x"].to_list() == [float(r) for r in range(engine.nranks)] - assert sorted_result["label"].to_list() == [ - f"r{r}" for r in range(engine.nranks) - ] + assert result.shape == (spmd_engine.nranks, 3) + assert result.columns == ["rank", "x", "label"] + sorted_result = result.sort("rank") + assert sorted_result["rank"].to_list() == list(range(spmd_engine.nranks)) + assert sorted_result["x"].to_list() == [float(r) for r in range(spmd_engine.nranks)] + assert sorted_result["label"].to_list() == [ + f"r{r}" for r in range(spmd_engine.nranks) + ] # --------------------------------------------------------------------------- @@ -218,44 +220,44 @@ def test_allgather_polars_dataframe_multi_column(spmd_comm: Communicator) -> Non # --------------------------------------------------------------------------- -def test_comm_argument_reuses_communicator(spmd_comm: Communicator) -> None: +def test_comm_argument_reuses_communicator(comm: Communicator) -> None: """Passing comm= 
reuses the communicator across two engine lifetimes.""" - with SPMDEngine(comm=spmd_comm) as engine1: + with SPMDEngine(comm=comm) as engine1: nranks = engine1.nranks rank = engine1.rank - # engine1 is shut down; spmd_comm is still alive - with SPMDEngine(comm=spmd_comm) as engine2: + # engine1 is shut down; the shared comm is still alive + with SPMDEngine(comm=comm) as engine2: assert engine2.nranks == nranks assert engine2.rank == rank -def test_comm_not_closed_after_engine_shutdown(spmd_comm: Communicator) -> None: +def test_comm_not_closed_after_engine_shutdown(comm: Communicator) -> None: """The caller-provided comm survives engine.shutdown().""" - with SPMDEngine(comm=spmd_comm): + with SPMDEngine(comm=comm): pass # engine.shutdown() is called on __exit__ - # spmd_comm must still be accessible — not destroyed by engine teardown - assert spmd_comm.rank >= 0 + # comm must still be accessible — not destroyed by engine teardown + assert comm.rank >= 0 -def test_comm_argument_mr_still_wrapped(spmd_comm: Communicator) -> None: +def test_comm_argument_mr_still_wrapped(comm: Communicator) -> None: """MR wrapping still happens even when comm is provided externally.""" - with SPMDEngine(comm=spmd_comm): + with SPMDEngine(comm=comm): assert isinstance(rmm.mr.get_current_device_resource(), RmmResourceAdaptor) -def test_comm_sequential_queries(spmd_comm: Communicator) -> None: +def test_comm_sequential_queries(comm: Communicator) -> None: """Two engines sharing a comm can each execute a query without interference.""" - with SPMDEngine(comm=spmd_comm) as engine: + with SPMDEngine(comm=comm) as engine: r1 = pl.LazyFrame({"a": [1, 2]}).collect(engine=engine) - with SPMDEngine(comm=spmd_comm) as engine: + with SPMDEngine(comm=comm) as engine: r2 = pl.LazyFrame({"a": [3, 4]}).collect(engine=engine) assert r1["a"].to_list() == [1, 2] assert r2["a"].to_list() == [3, 4] -def test_shutdown_idempotent(spmd_comm: Communicator) -> None: +def test_shutdown_idempotent(comm: 
Communicator) -> None: """Calling shutdown() twice does not raise.""" - engine = SPMDEngine(comm=spmd_comm) + engine = SPMDEngine(comm=comm) engine.shutdown() engine.shutdown() @@ -277,9 +279,9 @@ def test_memory_resource_config() -> None: mock_create.assert_called_once() -def test_comm_and_context_unavailable_after_shutdown(spmd_comm: Communicator) -> None: +def test_comm_and_context_unavailable_after_shutdown(comm: Communicator) -> None: """Accessing comm or context after shutdown raises RuntimeError.""" - engine = SPMDEngine(comm=spmd_comm) + engine = SPMDEngine(comm=comm) engine.shutdown() with pytest.raises(RuntimeError, match="shutdown"): _ = engine.comm @@ -287,8 +289,89 @@ def test_comm_and_context_unavailable_after_shutdown(spmd_comm: Communicator) -> _ = engine.context -def test_run(spmd_comm): - with SPMDEngine(comm=spmd_comm) as engine: - result = engine._run(os.getpid) - +def test_run(spmd_engine: SPMDEngine) -> None: + result = spmd_engine._run(os.getpid) assert result == [os.getpid()] + + +def test_reset_keeps_comm_alive(comm: Communicator) -> None: + """``_reset`` must not rebuild the communicator.""" + with SPMDEngine( + comm=comm, executor_options={"max_rows_per_partition": 10} + ) as engine: + comm_before = engine.comm + engine._reset(executor_options={"max_rows_per_partition": 7}) + # Same Communicator instance — caller-provided comm is preserved. + assert engine.comm is comm_before + # Engine still drives a real query. 
+ result = pl.LazyFrame({"a": [1, 2, 3]}).collect(engine=engine) + assert sorted(result["a"].to_list()) == [1, 2, 3] + + +def test_reset_updates_executor_options(comm: Communicator) -> None: + """``_reset`` updates the polars-layer config to the new options.""" + from cudf_polars.utils.config import SPMDContext + + with SPMDEngine( + comm=comm, executor_options={"max_rows_per_partition": 10} + ) as engine: + engine._reset(executor_options={"max_rows_per_partition": 42}) + + opts = engine.config["executor_options"] + assert opts["max_rows_per_partition"] == 42 + # Reserved keys are still injected by ``_reset``. + assert opts["runtime"] == "rapidsmpf" + assert opts["cluster"] == "spmd" + assert isinstance(opts["spmd_context"], SPMDContext) + + +def test_reset_collects_after_options_change(comm: Communicator) -> None: + """The engine still drives a real query after ``_reset``.""" + with SPMDEngine( + comm=comm, executor_options={"max_rows_per_partition": 10} + ) as engine: + engine._reset(executor_options={"max_rows_per_partition": 3}) + result = pl.LazyFrame({"a": [1, 2, 3, 4, 5]}).collect(engine=engine) + assert sorted(result["a"].to_list()) == [1, 2, 3, 4, 5] + + +def test_reset_after_shutdown_raises(comm: Communicator) -> None: + """``shutdown`` is idempotent; ``_reset`` after shutdown raises every time.""" + engine = SPMDEngine(comm=comm) + engine.shutdown() + engine.shutdown() # idempotent + with pytest.raises(RuntimeError, match="shut-down"): + engine._reset() + with pytest.raises(RuntimeError, match="shut-down"): + engine._reset() # still raises on a second attempt + engine.shutdown() # still safe after a failed _reset + + +def test_reset_rejects_construction_time_executor_options( + comm: Communicator, +) -> None: + """``_reset`` rejects ``executor_options`` keys read at engine construction.""" + with ( + SPMDEngine(comm=comm) as engine, + pytest.raises(ValueError, match="num_py_executors"), + ): + engine._reset(executor_options={"num_py_executors": 4}) + + 
+def test_reset_rejects_construction_time_engine_options( + comm: Communicator, +) -> None: + """``_reset`` rejects ``engine_options`` keys read at engine construction.""" + from cudf_polars.experimental.rapidsmpf.frontend.hardware_binding import ( + HardwareBindingPolicy, + ) + + with SPMDEngine(comm=comm) as engine: + with pytest.raises(ValueError, match="hardware_binding"): + engine._reset( + engine_options={ + "hardware_binding": HardwareBindingPolicy(enabled=False), + }, + ) + with pytest.raises(ValueError, match="memory_resource_config"): + engine._reset(engine_options={"memory_resource_config": None}) diff --git a/python/cudf_polars/tests/experimental/test_statistics.py b/python/cudf_polars/tests/experimental/test_statistics.py index 965449b80f0..82c121d5830 100644 --- a/python/cudf_polars/tests/experimental/test_statistics.py +++ b/python/cudf_polars/tests/experimental/test_statistics.py @@ -16,8 +16,6 @@ if TYPE_CHECKING: from collections.abc import Iterator - from rapidsmpf.communicator.communicator import Communicator - from cudf_polars.experimental.rapidsmpf.frontend.core import StreamingEngine # Runs the spmd variant even under rrun with nranks > 1. The ray/dask @@ -30,7 +28,7 @@ @pytest.fixture(params=["spmd", "ray", "dask"]) def engine( request: pytest.FixtureRequest, - spmd_comm: Communicator, + spmd_engine: SPMDEngine, ) -> Iterator[StreamingEngine]: """Yield each supported streaming engine with statistics enabled.""" backend = request.param @@ -39,7 +37,7 @@ def engine( if backend == "spmd": with SPMDEngine( - comm=spmd_comm, + comm=spmd_engine.comm, rapidsmpf_options=rapidsmpf_options, executor_options=executor_options, ) as engine: From 8bdabe74cfd6e7b59ee6ff48b3c48ac3095df71d Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Wed, 6 May 2026 05:30:14 -0500 Subject: [PATCH 16/36] Validate PDS-DS Q1 (#22389) Workaround for sum of nulls discrepancy between SQL and Polars. 
- Closes https://github.com/rapidsai/cudf/issues/22123 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/22389 --- .../experimental/benchmarks/pdsds_queries/q1.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q1.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q1.py index c4b8b7ec740..81fd42ea30e 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q1.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/pdsds_queries/q1.py @@ -74,7 +74,14 @@ def polars_impl(run_config: RunConfig) -> QueryResult: ) .filter(pl.col("d_year") == year) .group_by(["sr_customer_sk", "sr_store_sk"]) - .agg(pl.col("sr_return_amt").sum().alias("ctr_total_return")) + .agg( + # Polars sum() returns 0 for all-null groups; SQL returns NULL. + # See https://github.com/rapidsai/cudf/issues/19560. + pl.when(pl.col("sr_return_amt").count() > 0) + .then(pl.col("sr_return_amt").sum()) + .otherwise(None) + .alias("ctr_total_return") + ) .rename( { "sr_customer_sk": "ctr_customer_sk", From e304ffdd535ca2f5772ed25c5f80f46ca4f31d01 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Wed, 6 May 2026 07:39:44 -0500 Subject: [PATCH 17/36] Improve hstack lowering (#22353) - This is a follow-up to https://github.com/rapidsai/cudf/pull/21796 - This (hopefully) simplifies some code in https://github.com/rapidsai/cudf/pull/22191 **Problem statement**: We currently translate `HStack` nodes with non-pointwise expressions to the equivalent `Select` node at lowering time. This is because all our non-pointwise `Expr`-decomposition logic is specific to `Select`. Before this PR, this translation was skipped whenever the underlying `HStack` was completely overwriting its original columns.
The problem with this case is that we lose "anchor" columns that tell the `Select` how to broadcast scalar-aggregation results. **Proposed solution**: We add a temporary "anchor" column to the translated `HStack` so that broadcasting works correctly in the `Select` node. **Motivation**: - We can handle all `over()` expression decomposition within `Select` if we know **all** non-pointwise HStack operations are lowered to `Select` anyway. - We don't "fall back" for other non-`over` `HStack` corner cases either. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/22353 --- .../cudf_polars/experimental/parallel.py | 58 ++++++++++++++----- .../cudf_polars/testing/inject_gpu_engine.py | 11 ++++ .../tests/experimental/test_hstack.py | 2 +- 3 files changed, 54 insertions(+), 17 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index 108d7822d60..f77e923bce0 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -9,6 +9,8 @@ from functools import partial, reduce from typing import TYPE_CHECKING, Any +import polars as pl + import cudf_polars.experimental.distinct import cudf_polars.experimental.groupby import cudf_polars.experimental.io @@ -16,7 +18,8 @@ import cudf_polars.experimental.select import cudf_polars.experimental.shuffle import cudf_polars.experimental.sort # noqa: F401 -from cudf_polars.dsl.expr import Col, NamedExpr +from cudf_polars.containers import DataType +from cudf_polars.dsl.expr import Col, Literal, NamedExpr from cudf_polars.dsl.ir import ( IR, Cache, @@ -25,11 +28,13 @@ HStack, IRExecutionContext, MapFunction, + Projection, Select, Slice, Union, ) from cudf_polars.dsl.traversal import CachingVisitor, traversal +from cudf_polars.dsl.utils.naming import unique_names from 
cudf_polars.experimental.base import PartitionInfo, get_key_name from cudf_polars.experimental.dispatch import ( generate_ir_tasks, @@ -49,8 +54,6 @@ from collections.abc import MutableMapping from typing import Any - import polars as pl - from cudf_polars.experimental.base import StatsCollector from cudf_polars.experimental.dispatch import LowerIRTransformer, State from cudf_polars.utils.config import ConfigOptions, StreamingExecutor @@ -397,6 +400,20 @@ def _( ) +def _add_anchor_column(ir: HStack) -> tuple[HStack, str, DataType]: + """Add temporary anchor column to preserve row count.""" + anchor_name = next(unique_names((*ir.schema, *ir.children[0].schema))) + anchor_dtype = DataType(pl.datatypes.Int8()) + anchor_named_expr = NamedExpr(anchor_name, Literal(anchor_dtype, 0)) + new_ir = HStack( + ir.children[0].schema | {anchor_name: anchor_dtype}, + (anchor_named_expr,), + True, # noqa: FBT003 + ir.children[0], + ) + return new_ir, anchor_name, anchor_dtype + + @lower_ir_node.register(HStack) def _( ir: HStack, rec: LowerIRTransformer @@ -404,20 +421,29 @@ def _( if not all(e.is_pointwise for e in traversal([ne.value for ne in ir.columns])): # Redirect non-pointwise HStack to Select so the Select handler can # attempt decomposition (or fall back gracefully via decompose_select). + child: IR = ir.children[0] + anchor_name: str | None = None col_map = {ne.name: ne for ne in ir.columns} - has_passthrough = any(name not in col_map for name in ir.schema) - if has_passthrough or not ir.should_broadcast: - exprs = tuple( - col_map[name] if name in col_map else NamedExpr(name, Col(dtype, name)) - for name, dtype in ir.schema.items() - ) - return lower_ir_node( - Select(ir.schema, exprs, ir.should_broadcast, ir.children[0]), - rec, - ) - # All output columns are aggregations: no N-row passthrough to anchor - # broadcast. Fall back so HStack.do_evaluate uses target_length=child.num_rows. 
- return _lower_ir_fallback(ir, rec) + schema = ir.schema + if ir.should_broadcast and all(name in col_map for name in ir.schema): + # We need to add a temporary anchor column to preserve row count. + child, anchor_name, anchor_dtype = _add_anchor_column(ir) + + schema = ir.schema | {anchor_name: anchor_dtype} + exprs = tuple( + col_map[name] if name in col_map else NamedExpr(name, Col(dtype, name)) + for name, dtype in schema.items() + ) + new_ir: Select | Projection = Select(schema, exprs, ir.should_broadcast, child) + if anchor_name is not None: + # Need to drop the temporary anchor column. + schema = { + name: dtype + for name, dtype in new_ir.schema.items() + if name != anchor_name + } + new_ir = Projection(schema, new_ir) + return lower_ir_node(new_ir, rec) child, partition_info = rec(ir.children[0]) new_node = ir.reconstruct([child]) diff --git a/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py b/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py index 1cad7acec22..6fe2de4d154 100644 --- a/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py +++ b/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py @@ -334,9 +334,20 @@ def pytest_report_header(config: pytest.Config) -> str: "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[glob-parquet-sync]": "Too slow with --inject-gpu-engine-blocksize=small", "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-async]": "Too slow with --inject-gpu-engine-blocksize=small", "tests/unit/io/test_scan.py::test_scan_with_row_index_projected_out[single-parquet-sync]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs0-True-None]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs1-True-None]": "Too slow with --inject-gpu-engine-blocksize=small", 
"tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs2-True-unordered_columns2]": "Too slow with --inject-gpu-engine-blocksize=small", "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs3-True-None]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs4-True-None]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs5-True-unordered_columns5]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs6-False-unordered_columns6]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs7-False-None]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs8-False-None]": "Too slow with --inject-gpu-engine-blocksize=small", "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs9-True-unordered_columns9]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs10-True-unordered_columns10]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs11-False-unordered_columns11]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs12-False-None]": "Too slow with --inject-gpu-engine-blocksize=small", + "tests/unit/lazyframe/test_order_observability.py::test_with_columns_sensitivity[exprs13-False-None]": "Too slow with --inject-gpu-engine-blocksize=small", 
"tests/unit/lazyframe/test_optimizations.py::test_collapse_joins_combinations": "Too slow for CI", "tests/unit/operations/test_slice.py::test_slice_slice_pushdown": "Too slow with --inject-gpu-engine-blocksize=small", "tests/unit/operations/test_group_by.py::test_group_by_first_last_big[Int32-10432-False]": "Too slow with --inject-gpu-engine-blocksize=small", diff --git a/python/cudf_polars/tests/experimental/test_hstack.py b/python/cudf_polars/tests/experimental/test_hstack.py index 17dede9dddc..9bbb4b7aa33 100644 --- a/python/cudf_polars/tests/experimental/test_hstack.py +++ b/python/cudf_polars/tests/experimental/test_hstack.py @@ -105,7 +105,7 @@ def test_hstack_non_pointwise_redirect_covers_parallel_hstack_handler(engine): def test_with_columns_scalar_upstream_20981(engine): # Based on upstream-Polars unit test. - lf = pl.LazyFrame({"a": [1.0, 2.0, 3.0]}) + lf = pl.LazyFrame({"a": [1.0, 2.0, 3.0, 4.0, 5.0]}) q = lf.with_columns(pl.col.a.mean()) assert_gpu_result_equal(q, engine=engine) From 9edc7dcaa118e151fcaebdb17010d446d53f99b3 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 6 May 2026 09:12:57 -0400 Subject: [PATCH 18/36] Replace `LD_PRELOAD` hack with compute-sanitizer (#22290) We were previously swapping out cudart symbols using `LD_PRELOAD` and `dlsym()`. Adopt a more robust approach that uses the compute-sanitizer library (https://docs.nvidia.com/compute-sanitizer/) instead. This will also allow us to switch to static cudart, contributing to https://github.com/rapidsai/build-planning/issues/235. This also vendors `FindCUDAToolkit.cmake` from CMake as of a83b2de6. 
Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/22290 --- .github/workflows/pr.yaml | 50 - .github/workflows/test.yaml | 45 - .pre-commit-config.yaml | 12 +- ci/build_streams.sh | 50 - ci/test_streams.sh | 35 - conda/recipes/libcudf/recipe.yaml | 15 +- cpp/CMakeLists.txt | 16 +- cpp/cmake/Modules/FindCUDAToolkit.cmake | 1567 +++++++++++++++++ cpp/tests/utilities/identify_stream_usage.cpp | 340 ++-- dependencies.yaml | 8 +- python/pylibcudf/tests/conftest.py | 15 +- .../pylibcudf/tests/test_column_from_array.py | 23 +- python/pylibcudf/tests/test_interop.py | 5 +- python/pylibcudf/tests/test_reshape.py | 9 +- 14 files changed, 1752 insertions(+), 438 deletions(-) delete mode 100755 ci/build_streams.sh delete mode 100755 ci/test_streams.sh create mode 100644 cpp/cmake/Modules/FindCUDAToolkit.cmake diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index 4705a1e10c7..c20f7f7ea79 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -42,10 +42,6 @@ jobs: # - narwhals-tests - telemetry-setup - third-party-integration-tests-cudf-pandas - - streams-build-matrix - - streams-build - - streams-test-matrix - - streams-test secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@main if: always() @@ -614,52 +610,6 @@ jobs: needs: changed-files uses: ./.github/workflows/spark-rapids-jni.yaml if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_java - streams-build-matrix: - needs: [checks, changed-files] - uses: rapidsai/shared-workflows/.github/workflows/compute-matrix.yaml@main - if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp - with: - build_type: pull-request - matrix_name: conda-cpp-build - matrix_filter: map(select(.ARCH == "amd64")) - streams-build: - needs: streams-build-matrix - uses: 
rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.streams-build-matrix.outputs.matrix) }} - with: - build_type: pull-request - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - node_type: cpu8 - container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" - script: ci/build_streams.sh - artifact-name: stream_tests_${{ matrix.CUDA_VER }} - file_to_upload: cpp/install - streams-test-matrix: - needs: streams-build - uses: rapidsai/shared-workflows/.github/workflows/compute-matrix.yaml@main - with: - build_type: pull-request - matrix_name: conda-cpp-tests - # This selects "ARCH=amd64 + the latest supported Python + CUDA". - matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - streams-test: - needs: streams-test-matrix - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.streams-test-matrix.outputs.matrix) }} - with: - build_type: pull-request - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - node_type: gpu-l4-latest-1 - container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" - script: ci/test_streams.sh stream_tests_${{ matrix.CUDA_VER }} telemetry-summarize: # This job must use a self-hosted runner to record telemetry traces. 
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 385acf09227..95439e65744 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -186,48 +186,3 @@ jobs: node_type: "gpu-l4-latest-1" container_image: "rapidsai/ci-conda:26.06-latest" script: ci/test_narwhals.sh - streams-build-matrix: - uses: rapidsai/shared-workflows/.github/workflows/compute-matrix.yaml@main - with: - build_type: ${{ inputs.build_type || 'branch' }} - matrix_name: conda-cpp-build - matrix_filter: map(select(.ARCH == "amd64")) - streams-build: - needs: streams-build-matrix - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.streams-build-matrix.outputs.matrix) }} - with: - build_type: ${{ inputs.build_type || 'branch' }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - node_type: cpu8 - container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" - script: ci/build_streams.sh - artifact-name: stream_tests_${{ matrix.CUDA_VER }} - file_to_upload: cpp/install - streams-test-matrix: - needs: streams-build - uses: rapidsai/shared-workflows/.github/workflows/compute-matrix.yaml@main - if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_cpp - with: - build_type: pull-request - matrix_name: conda-cpp-tests - # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
- matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - streams-test: - needs: streams-test-matrix - uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main - strategy: - fail-fast: false - matrix: ${{ fromJSON(needs.streams-test-matrix.outputs.matrix) }} - with: - build_type: ${{ inputs.build_type || 'branch' }} - branch: ${{ inputs.branch }} - date: ${{ inputs.date }} - sha: ${{ inputs.sha }} - node_type: gpu-l4-latest-1 - container_image: "rapidsai/ci-conda:26.06-cuda${{ matrix.CUDA_VER }}-${{ matrix.LINUX_VER }}-py${{ matrix.PY_VER }}" - script: RAPIDS_BUILD_WORKFLOW_NAME=test.yaml ci/test_streams.sh stream_tests_${{ matrix.CUDA_VER }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a420fde44b0..1fb05425bd3 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -146,6 +146,10 @@ repos: entry: ./cpp/scripts/run-cmake-format.sh cmake-format language: python types: [cmake] + # TODO: Remove FindCUDAToolkit once we require CMake 4.4 + exclude: | + (?x) + ^cpp/cmake/Modules/FindCUDAToolkit[.]cmake$ # Note that pre-commit autoupdate does not update the versions # of dependencies, so we'll have to update this manually. additional_dependencies: @@ -157,6 +161,10 @@ repos: entry: ./cpp/scripts/run-cmake-format.sh cmake-lint language: python types: [cmake] + # TODO: Remove FindCUDAToolkit once we require CMake 4.4 + exclude: | + (?x) + ^cpp/cmake/Modules/FindCUDAToolkit[.]cmake$ # Note that pre-commit autoupdate does not update the versions # of dependencies, so we'll have to update this manually. 
additional_dependencies: @@ -213,6 +221,7 @@ repos: pytest[.]ini$| ^[.]pre-commit-config[.]yaml$| Makefile$ + # TODO: Remove FindCUDAToolkit once we require CMake 4.4 exclude: | (?x)^( cpp/include/cudf_test/cxxopts[.]hpp$| @@ -226,7 +235,8 @@ repos: cpp/src/io/comp/unbz2[.]hpp$| cpp/src/io/comp/gpuinflate[.]cu$| cpp/src/io/utilities/base64_utilities[.]cpp$| - cpp/src/io/utilities/base64_utilities[.]hpp$ + cpp/src/io/utilities/base64_utilities[.]hpp$| + cpp/cmake/Modules/FindCUDAToolkit[.]cmake$ ) - id: verify-copyright name: verify-copyright-brotli diff --git a/ci/build_streams.sh b/ci/build_streams.sh deleted file mode 100755 index 46eaf2f1385..00000000000 --- a/ci/build_streams.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -rapids-logger "Create test conda environment" -. /opt/conda/etc/profile.d/conda.sh - -rapids-logger "Generate C++ testing dependencies" - -ENV_YAML_DIR="$(mktemp -d)" - -rapids-dependency-file-generator \ - --output conda \ - --file-key stream_tests \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml" - -rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n stream_tests - -# Temporarily allow unbound variables for conda activation. 
-set +u -conda activate stream_tests -set -u - -RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" -export RAPIDS_CUDA_MAJOR - -source rapids-configure-sccache - -SCCACHE_S3_KEY_PREFIX="cudf-streams/$(arch)/cuda${RAPIDS_CUDA_MAJOR}/objects-cache" -SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX="cudf-streams/$(arch)/cuda${RAPIDS_CUDA_MAJOR}/preprocessor-cache" -SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true -export SCCACHE_S3_KEY_PREFIX SCCACHE_S3_PREPROCESSOR_CACHE_KEY_PREFIX SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE - -rapids-print-env - -rapids-logger "Run C++ build" - -cmake -S cpp -B cpp/build -GNinja \ - -DCUDA_STATIC_RUNTIME=OFF \ - -DCUDF_BUILD_STREAMS_TEST_UTIL=ON \ - -DBUILD_SHARED_LIBS=ON -mkdir cpp/install -cmake --build cpp/build "-j${PARALLEL_LEVEL}" -cmake --install cpp/build --prefix cpp/install -cmake --install cpp/build --prefix cpp/install --component testing - -sccache --show-adv-stats -sccache --stop-server >/dev/null 2>&1 || true diff --git a/ci/test_streams.sh b/ci/test_streams.sh deleted file mode 100755 index e325dd1d049..00000000000 --- a/ci/test_streams.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. -# SPDX-License-Identifier: Apache-2.0 - -set -euo pipefail - -readonly artifact_name="$1" - -rapids-logger "Create test conda environment" -. /opt/conda/etc/profile.d/conda.sh - -rapids-logger "Download stream test artifacts" -STREAM_TESTS="$(rapids-download-from-github "$artifact_name")" - -rapids-logger "Generate C++ testing dependencies" - -ENV_YAML_DIR="$(mktemp -d)" - -rapids-dependency-file-generator \ - --output conda \ - --file-key stream_tests \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml" - -rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n stream_tests - -# Temporarily allow unbound variables for conda activation. 
-set +u -conda activate stream_tests -set -u - -rapids-print-env - -rapids-logger "Run C++ tests" - -ctest --test-dir "${STREAM_TESTS}/bin/gtests/libcudf" --output-on-failure diff --git a/conda/recipes/libcudf/recipe.yaml b/conda/recipes/libcudf/recipe.yaml index 67f921d9a67..c2e3fff245b 100644 --- a/conda/recipes/libcudf/recipe.yaml +++ b/conda/recipes/libcudf/recipe.yaml @@ -32,10 +32,15 @@ cache: cudf_ROOT="$(realpath ./cpp/build)" export cudf_ROOT + cmake_args= + for arg in $CMAKE_ARGS; do + cmake_args="$cmake_args \"$arg\"" + done + ./build.sh -n -v \ libcudf libcudf_kafka benchmarks tests \ --build_metrics --incl_cache_stats --allgpuarch \ - --cmake-args=\"-DCUDF_ENABLE_ARROW_S3=ON\" + --cmake-args="\"-DCUDF_ENABLE_ARROW_S3=ON\" $cmake_args" secrets: - AWS_ACCESS_KEY_ID - AWS_SECRET_ACCESS_KEY @@ -79,8 +84,10 @@ cache: host: - librmm =${{ minor_version }} - libkvikio =${{ minor_version }} + - cuda-cudart-dev - cuda-nvrtc-dev - cuda-nvtx-dev + - cuda-sanitizer-api - libcurand-dev - libnvjitlink-dev - if: linux and x86_64 @@ -119,6 +126,7 @@ outputs: - ${{ compiler("c") }} host: - cuda-version =${{ cuda_version }} + - cuda-sanitizer-api - libkvikio =${{ minor_version }} - librmm =${{ minor_version }} - libnvcomp-dev ${{ nvcomp_version }} @@ -129,6 +137,7 @@ outputs: run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - cuda-nvrtc + - cuda-sanitizer-api - if: linux and x86_64 then: - libcufile @@ -148,6 +157,7 @@ outputs: - cuda-cudart - cuda-nvrtc - cuda-nvtx + - cuda-sanitizer-api - cuda-version - flatbuffers - libcufile @@ -306,11 +316,13 @@ outputs: - cuda-version =${{ cuda_version }} - libcurand-dev - cuda-cudart-dev + - cuda-sanitizer-api run: - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - ${{ pin_subpackage("libcudf", exact=True) }} - ${{ pin_subpackage("libcudf_kafka", exact=True) }} - libcurand + - cuda-sanitizer-api ignore_run_exports: from_package: - libcurand-dev @@ -318,6 +330,7 @@ outputs: - 
cuda-cudart - cuda-nvrtc - cuda-nvtx + - cuda-sanitizer-api - cuda-version - flatbuffers - libcudf diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5a0b2f95e83..c2485171c71 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -63,7 +63,7 @@ option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON) -if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS) +if(NOT BUILD_SHARED_LIBS) set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL OFF) endif() @@ -1137,13 +1137,6 @@ endif() # * build cudf_identify_stream_usage -------------------------------------------------------------- if(CUDF_BUILD_STREAMS_TEST_UTIL) - if(CUDA_STATIC_RUNTIME) - message( - FATAL_ERROR - "Stream identification cannot be used with a static CUDA runtime. Please set CUDA_STATIC_RUNTIME=OFF or CUDF_BUILD_STREAMS_TEST_UTIL=OFF." - ) - endif() - # Libraries for stream-related testing. We build the library twice, one with STREAM_MODE_TESTING # on and one with it set to off. Each test will then be configured to use the appropriate library # depending via ctest and whether it has been updated to expose public stream APIs. 
@@ -1157,6 +1150,9 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ) endif() + set(sanitizer_relative_genex + "$,$>" + ) set_target_properties( ${_tgt} PROPERTIES # set target compile options @@ -1164,13 +1160,13 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN;\$ORIGIN/${sanitizer_relative_genex}" ) target_compile_options( ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm CUDA::sanitizer) rapids_cuda_set_runtime(${_tgt} USE_STATIC ${CUDA_STATIC_RUNTIME}) add_library(cudf::${_tgt} ALIAS ${_tgt}) diff --git a/cpp/cmake/Modules/FindCUDAToolkit.cmake b/cpp/cmake/Modules/FindCUDAToolkit.cmake new file mode 100644 index 00000000000..63b9baad90a --- /dev/null +++ b/cpp/cmake/Modules/FindCUDAToolkit.cmake @@ -0,0 +1,1567 @@ +# SPDX-FileCopyrightText: Copyright 2000-2026 Kitware, Inc. and Contributors +# SPDX-License-Identifier: BSD-3-Clause + +# Distributed under the OSI-approved BSD 3-Clause License. See accompanying +# file LICENSE.rst or https://cmake.org/licensing for details. + +#[=======================================================================[.rst: +FindCUDAToolkit +--------------- + +.. versionadded:: 3.17 + +Finds the NVIDIA CUDA toolkit and the associated libraries, but does not +require the ``CUDA`` language be enabled for a given project: + +.. code-block:: cmake + + find_package(CUDAToolkit [] [QUIET] [REQUIRED] [EXACT] [...]) + +This module does not search for the NVIDIA CUDA Samples. + +.. versionadded:: 3.19 + QNX support. + +Search Behavior +^^^^^^^^^^^^^^^ + +The CUDA Toolkit search behavior uses the following order: + +1. If the ``CUDA`` language has been enabled we will use the directory + containing the compiler as the first search location for ``nvcc``. + +2. 
If the variable :variable:`CMAKE_CUDA_COMPILER _COMPILER>` or + the environment variable :envvar:`CUDACXX` is defined, it will be used + as the path to the ``nvcc`` executable. + +3. If the ``CUDAToolkit_ROOT`` cmake configuration variable (e.g., + ``-DCUDAToolkit_ROOT=/some/path``) *or* environment variable is defined, it + will be searched. If both an environment variable **and** a + configuration variable are specified, the *configuration* variable takes + precedence. + + The directory specified here must be such that the executable ``nvcc`` or + the appropriate ``version.txt`` or ``version.json`` file can be found + underneath the specified directory. + +4. If the CUDA_PATH environment variable is defined, it will be searched + for ``nvcc``. + +5. The user's path is searched for ``nvcc`` using :command:`find_program`. If + this is found, no subsequent search attempts are performed. Users are + responsible for ensuring that the first ``nvcc`` to show up in the path is + the desired path in the event that multiple CUDA Toolkits are installed. + +6. On Unix systems, if the symbolic link ``/usr/local/cuda`` exists, this is + used. No subsequent search attempts are performed. No default symbolic link + location exists for the Windows platform. + +7. The platform specific default install locations are searched. If exactly one + candidate is found, this is used. 
The default CUDA Toolkit install locations + searched are: + + +-------------+-------------------------------------------------------------+ + | Platform | Search Pattern | + +=============+=============================================================+ + | macOS | ``/Developer/NVIDIA/CUDA-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Other Unix | ``/usr/local/cuda-X.Y`` | + +-------------+-------------------------------------------------------------+ + | Windows | ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y`` | + +-------------+-------------------------------------------------------------+ + + Where ``X.Y`` would be a specific version of the CUDA Toolkit, such as + ``/usr/local/cuda-9.0`` or + ``C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v9.0`` + + .. note:: + + When multiple CUDA Toolkits are installed in the default location of a + system (e.g., both ``/usr/local/cuda-9.0`` and ``/usr/local/cuda-10.0`` + exist but the ``/usr/local/cuda`` symbolic link does **not** exist), this + package is marked as **not** found. + + There are too many factors involved in making an automatic decision in + the presence of multiple CUDA Toolkits being installed. In this + situation, users are encouraged to either (1) set ``CUDAToolkit_ROOT`` or + (2) ensure that the correct ``nvcc`` executable shows up in ``$PATH`` for + :command:`find_program` to find. + +Arguments +^^^^^^^^^ + +``[]`` + The ``[]`` argument requests a version with which the package found + should be compatible. See :ref:`find_package version format ` + for more details. + +Options +^^^^^^^ + +``REQUIRED`` + If specified, configuration will error if a suitable CUDA Toolkit is not + found. + +``QUIET`` + If specified, the search for a suitable CUDA Toolkit will not produce any + messages. + +``EXACT`` + If specified, the CUDA Toolkit is considered found only if the exact + ``VERSION`` specified is recovered. 
+ +Imported Targets +^^^^^^^^^^^^^^^^ + +An :ref:`imported target ` named ``CUDA::toolkit`` is provided. + +This module provides :ref:`Imported Targets` for each +of the following libraries that are part of the CUDAToolkit: + +- `CUDA Runtime Library`_ +- `CUDA Driver Library`_ +- `cuBLAS`_ +- `cuDLA`_ +- `cuFile`_ +- `cuFFT`_ +- `cuRAND`_ +- `cuSOLVER`_ +- `cuSPARSE`_ +- `cuPTI`_ +- `NPP`_ +- `nvBLAS`_ +- `nvGRAPH`_ +- `nvJPEG`_ +- `nvidia-ML`_ +- `nvPTX Compiler`_ +- `nvRTC`_ +- `nvJitLink`_ +- `nvFatBin`_ +- `nvToolsExt`_ +- `nvtx3`_ +- `OpenCL`_ +- `cuLIBOS`_ + +CUDA Runtime Library +"""""""""""""""""""" + +The CUDA Runtime library (cudart) are what most applications will typically +need to link against to make any calls such as `cudaMalloc`, and `cudaFree`. + +Targets Created: + +- ``CUDA::cudart`` +- ``CUDA::cudart_static`` + +CUDA Driver Library +"""""""""""""""""""" + +The CUDA Driver library (cuda) are used by applications that use calls +such as `cuMemAlloc`, and `cuMemFree`. + +Targets Created: + +- ``CUDA::cuda_driver`` + +cuBLAS +"""""" + +The `CUDA Basic Linear Algebra Subroutine`_ library. + +Targets Created: + +- ``CUDA::cublas`` +- ``CUDA::cublas_static`` +- ``CUDA::cublasLt`` starting in CUDA 10.1 +- ``CUDA::cublasLt_static`` starting in CUDA 10.1 + +.. _`CUDA Basic Linear Algebra Subroutine`: https://docs.nvidia.com/cuda/cublas + +cuDLA +"""""" + +.. versionadded:: 3.27 + +The `NVIDIA Tegra Deep Learning Accelerator`_ library. + +Targets Created: + +- ``CUDA::cudla`` starting in CUDA 11.6 + +.. _`NVIDIA Tegra Deep Learning Accelerator`: https://docs.nvidia.com/cuda/cuda-for-tegra-appnote#cudla + +cuFile +"""""" + +.. versionadded:: 3.25 + +The `NVIDIA GPUDirect Storage cuFile`_ library. + +Targets Created: + +- ``CUDA::cuFile`` starting in CUDA 11.4 +- ``CUDA::cuFile_static`` starting in CUDA 11.4 +- ``CUDA::cuFile_rdma`` starting in CUDA 11.4 +- ``CUDA::cuFile_rdma_static`` starting in CUDA 11.4 + +.. 
_`NVIDIA GPUDirect Storage cuFile`: https://docs.nvidia.com/gpudirect-storage/api-reference-guide + +cuFFT +""""" + +The `CUDA Fast Fourier Transform`_ library. + +Targets Created: + +- ``CUDA::cufft`` +- ``CUDA::cufftw`` +- ``CUDA::cufft_static`` +- ``CUDA::cufft_static_nocallback`` starting in CUDA 9.2, requires CMake 3.23+ +- ``CUDA::cufftw_static`` + +.. _`CUDA Fast Fourier Transform`: https://docs.nvidia.com/cuda/cufft + +cuRAND +"""""" + +The `CUDA random number generation`_ library. + +Targets Created: + +- ``CUDA::curand`` +- ``CUDA::curand_static`` + +.. _`CUDA random number generation`: https://docs.nvidia.com/cuda/curand + +cuSOLVER +"""""""" + +A `GPU accelerated linear system solver`_ library. + +Targets Created: + +- ``CUDA::cusolver`` +- ``CUDA::cusolver_static`` + +.. _`GPU accelerated linear system solver`: https://docs.nvidia.com/cuda/cusolver + +cuSPARSE +"""""""" + +The `CUDA sparse matrix`_ library. + +Targets Created: + +- ``CUDA::cusparse`` +- ``CUDA::cusparse_static`` + +.. _`CUDA sparse matrix`: https://docs.nvidia.com/cuda/cusparse + +cupti +""""" + +The `NVIDIA CUDA Profiling Tools Interface`_. + +Targets Created: + +- ``CUDA::cupti`` +- ``CUDA::cupti_static`` + +.. versionadded:: 3.27 + + - ``CUDA::nvperf_host`` starting in CUDA 10.2 + - ``CUDA::nvperf_host_static`` starting in CUDA 10.2 + - ``CUDA::nvperf_target`` starting in CUDA 10.2 + - ``CUDA::pcsamplingutil`` starting in CUDA 11.3 + +.. _`NVIDIA CUDA Profiling Tools Interface`: https://developer.nvidia.com/cupti + +NPP +""" + +The `NVIDIA 2D Image and Signal Processing Performance Primitives`_ libraries. 
+ +Targets Created: + +- `nppc`: + + - ``CUDA::nppc`` + - ``CUDA::nppc_static`` + +- `nppial`: Arithmetic and logical operation functions in `nppi_arithmetic_and_logical_operations.h` + + - ``CUDA::nppial`` + - ``CUDA::nppial_static`` + +- `nppicc`: Color conversion and sampling functions in `nppi_color_conversion.h` + + - ``CUDA::nppicc`` + - ``CUDA::nppicc_static`` + +- `nppicom`: JPEG compression and decompression functions in `nppi_compression_functions.h` + Removed starting in CUDA 11.0, use `nvJPEG`_ instead. + + - ``CUDA::nppicom`` + - ``CUDA::nppicom_static`` + +- `nppidei`: Data exchange and initialization functions in `nppi_data_exchange_and_initialization.h` + + - ``CUDA::nppidei`` + - ``CUDA::nppidei_static`` + +- `nppif`: Filtering and computer vision functions in `nppi_filter_functions.h` + + - ``CUDA::nppif`` + - ``CUDA::nppif_static`` + +- `nppig`: Geometry transformation functions found in `nppi_geometry_transforms.h` + + - ``CUDA::nppig`` + - ``CUDA::nppig_static`` + +- `nppim`: Morphological operation functions found in `nppi_morphological_operations.h` + + - ``CUDA::nppim`` + - ``CUDA::nppim_static`` + +- `nppist`: Statistics and linear transform in `nppi_statistics_functions.h` and `nppi_linear_transforms.h` + + - ``CUDA::nppist`` + - ``CUDA::nppist_static`` + +- `nppisu`: Memory support functions in `nppi_support_functions.h` + + - ``CUDA::nppisu`` + - ``CUDA::nppisu_static`` + +- `nppitc`: Threshold and compare operation functions in `nppi_threshold_and_compare_operations.h` + + - ``CUDA::nppitc`` + - ``CUDA::nppitc_static`` + +- `npps`: + + - ``CUDA::npps`` + - ``CUDA::npps_static`` + +.. _`NVIDIA 2D Image and Signal Processing Performance Primitives`: https://docs.nvidia.com/cuda/npp + +nvBLAS +"""""" + +The `GPU-accelerated drop-in BLAS`_ library. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvblas`` + +.. 
_`GPU-accelerated drop-in BLAS`: https://docs.nvidia.com/cuda/nvblas + +nvGRAPH +""""""" + +A `GPU-accelerated graph analytics`_ library. +Removed starting in CUDA 11.0 + +Targets Created: + +- ``CUDA::nvgraph`` +- ``CUDA::nvgraph_static`` + +.. _`GPU-accelerated graph analytics`: https://docs.nvidia.com/cuda/archive/10.0/nvgraph + +nvJPEG +"""""" + +A `GPU-accelerated JPEG codec`_ library. +Introduced in CUDA 10. + +Targets Created: + +- ``CUDA::nvjpeg`` +- ``CUDA::nvjpeg_static`` + +.. _`GPU-accelerated JPEG codec`: https://docs.nvidia.com/cuda/nvjpeg + +nvPTX Compiler +"""""""""""""" + +.. versionadded:: 3.25 + +The `PTX Compiler APIs`_. +These are a set of APIs which can be used to compile a PTX program into GPU assembly code. +Introduced in CUDA 11.1 +This is a static library only. + +Targets Created: + +- ``CUDA::nvptxcompiler_static`` starting in CUDA 11.1 + +.. _`PTX Compiler APIs`: https://docs.nvidia.com/cuda/ptx-compiler-api + +nvRTC +""""" + +A `runtime compilation library for CUDA`_. + +Targets Created: + +- ``CUDA::nvrtc`` + +.. versionadded:: 3.26 + + - ``CUDA::nvrtc_builtins`` + - ``CUDA::nvrtc_static`` starting in CUDA 11.5 + - ``CUDA::nvrtc_builtins_static`` starting in CUDA 11.5 + +.. _`runtime compilation library for CUDA`: https://docs.nvidia.com/cuda/nvrtc + +nvJitLink +""""""""" + +The `JIT Link APIs`_. + +Targets Created: + +- ``CUDA::nvJitLink`` starting in CUDA 12.0 +- ``CUDA::nvJitLink_static`` starting in CUDA 12.0 + +.. _`JIT Link APIs`: https://docs.nvidia.com/cuda/nvjitlink + +nvFatBin +""""""""" + +.. versionadded:: 3.30 + +The `Fatbin Creator APIs`_. + +Targets Created: + +- ``CUDA::nvfatbin`` starting in CUDA 12.4 +- ``CUDA::nvfatbin_static`` starting in CUDA 12.4 + +.. _`Fatbin Creator APIs`: https://docs.nvidia.com/cuda/nvfatbin + +nvidia-ML +""""""""" + +The `NVIDIA Management Library`_. + +Targets Created: + +- ``CUDA::nvml`` +- ``CUDA::nvml_static`` starting in CUDA 12.4 + +.. versionadded:: 3.31 + Added ``CUDA::nvml_static``. 
+ +.. _`NVIDIA Management Library`: https://developer.nvidia.com/management-library-nvml + +.. _`FindCUDAToolkit_nvToolsExt`: + +nvToolsExt +"""""""""" + +.. deprecated:: 3.25 + + With CUDA 10.0+, use `nvtx3`_. + Starting in CUDA 12.9 the `nvToolsExt` library no longer exists + +The `legacy NVIDIA Tools Extension`_. +This is a shared library only. + +Targets Created: + +- ``CUDA::nvToolsExt`` + +.. _`legacy NVIDIA Tools Extension`: https://docs.nvidia.com/cuda/archive/9.0/profiler-users-guide#nvtx + +.. _`FindCUDAToolkit_nvtx3`: + +nvtx3 +""""" + +.. versionadded:: 3.25 + +The header-only `NVIDIA Tools Extension`_ library. +Introduced in CUDA 10.0. + +Targets created: + +- ``CUDA::nvtx3`` + + +- ``CUDA::nvtx3_interop`` + + .. versionadded:: 4.1 + + This is provided by CUDA 12.9 and above for use by languages that + cannot consume C++ header-only libraries, such as ``Fortran``. + +.. _`NVIDIA Tools Extension`: https://nvidia.github.io/NVTX/doxygen + +OpenCL +"""""" + +The `NVIDIA Open Computing Language`_ library. +This is a shared library only. + +Targets Created: + +- ``CUDA::OpenCL`` + +.. _`NVIDIA Open Computing Language`: https://developer.nvidia.com/opencl + +cuLIBOS +""""""" + +The cuLIBOS library is a backend thread abstraction layer library which is +static only. The ``CUDA::cublas_static``, ``CUDA::cusparse_static``, +``CUDA::cufft_static``, ``CUDA::curand_static``, and (when implemented) NPP +libraries all automatically have this dependency linked. + +Target Created: + +- ``CUDA::culibos`` + +**Note**: direct usage of this target by consumers should not be necessary. + +.. _`FindCUDAToolkit_bin2c`: + +bin2c +""""" + +.. versionadded:: 4.3 + +A utility that converts binary files to C files containing byte arrays. + +Target Created: + +- ``CUDA::bin2c`` + +.. _`FindCUDAToolkit_sanitizer`: + +compute-sanitizer +""""""""""""""""" + +.. versionadded:: 4.4 + +The `NVIDIA Compute Sanitizer`_ library, which allows the tracing of CUDA +runtime and driver calls. 
+ +Target Created: + +- ``CUDA::sanitizer`` + +.. _`NVIDIA Compute Sanitizer`: https://docs.nvidia.com/compute-sanitizer + +Result Variables +^^^^^^^^^^^^^^^^ + +This module defines the following variables: + +``CUDAToolkit_FOUND`` + A boolean specifying whether or not the CUDA Toolkit was found. + +``CUDAToolkit_VERSION`` + The exact version of the CUDA Toolkit found (as reported by + ``nvcc --version``, ``version.txt``, or ``version.json``). + +``CUDAToolkit_VERSION_MAJOR`` + The major version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_MINOR`` + The minor version of the CUDA Toolkit. + +``CUDAToolkit_VERSION_PATCH`` + The patch version of the CUDA Toolkit. + +``CUDAToolkit_BIN_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + executable ``nvcc``. + +``CUDAToolkit_INCLUDE_DIRS`` + List of paths to all the CUDA Toolkit folders containing header files + required to compile a project linking against CUDA. + +``CUDAToolkit_LIBRARY_DIR`` + The path to the CUDA Toolkit library directory that contains the CUDA + Runtime library ``cudart``. + +``CUDAToolkit_LIBRARY_ROOT`` + .. versionadded:: 3.18 + + The path to the CUDA Toolkit directory containing the nvvm directory and + either version.txt or version.json. + +``CUDAToolkit_TARGET_DIR`` + The path to the CUDA Toolkit directory including the target architecture + when cross-compiling. When not cross-compiling this will be equivalent to + the parent directory of ``CUDAToolkit_BIN_DIR``. + +``CUDAToolkit_NVCC_EXECUTABLE`` + The path to the NVIDIA CUDA compiler ``nvcc``. Note that this path may + **not** be the same as + :variable:`CMAKE_CUDA_COMPILER _COMPILER>`. ``nvcc`` must be + found to determine the CUDA Toolkit version as well as determining other + features of the Toolkit. This variable is set for the convenience of + modules that depend on this one. 
+#]=======================================================================] + +# NOTE: much of this was simply extracted from FindCUDA.cmake. + +# James Bigler, NVIDIA Corp (nvidia.com - jbigler) +# Abe Stephens, SCI Institute -- http://www.sci.utah.edu/~abe/FindCuda.html +# +# Copyright (c) 2008 - 2009 NVIDIA Corporation. All rights reserved. +# +# Copyright (c) 2007-2009 +# Scientific Computing and Imaging Institute, University of Utah +# +# This code is licensed under the MIT License. See the FindCUDA.cmake script +# for the text of the license. + +# The MIT License +# +# License for the specific language governing rights and limitations under +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+# +############################################################################### + +function(_CUDAToolkit_build_include_dirs result_variable default_paths_variable) + set(content "${${default_paths_variable}}") + set(${result_variable} "${content}" PARENT_SCOPE) +endfunction() + +function(_CUDAToolkit_build_library_dirs result_variable default_paths_variable) + set(content "${${default_paths_variable}}") + set(${result_variable} "${content}" PARENT_SCOPE) +endfunction() + +# The toolkit is located during compiler detection for CUDA and stored in CMakeCUDACompiler.cmake as +# - CMAKE_CUDA_COMPILER_TOOLKIT_ROOT +# - CMAKE_CUDA_COMPILER_LIBRARY_ROOT +# - CMAKE_CUDA_COMPILER_LIBRARY_DIRECTORIES_FROM_IMPLICIT_LIBRARIES +# - CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES +# We compute the rest based on those here to avoid re-searching and to avoid finding a possibly +# different installation. +if(CMAKE_CUDA_COMPILER_TOOLKIT_ROOT) + set(CUDAToolkit_ROOT_DIR "${CMAKE_CUDA_COMPILER_TOOLKIT_ROOT}") + set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_CUDA_COMPILER_LIBRARY_ROOT}") + _CUDAToolkit_build_library_dirs(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES CMAKE_CUDA_HOST_IMPLICIT_LINK_DIRECTORIES) + _CUDAToolkit_build_include_dirs(CUDAToolkit_INCLUDE_DIRECTORIES CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_ROOT_DIR}/bin") + set(CUDAToolkit_NVCC_EXECUTABLE "${CUDAToolkit_BIN_DIR}/nvcc${CMAKE_EXECUTABLE_SUFFIX}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_TOOLKIT_VERSION}") + + if(CUDAToolkit_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + endif() +else() + function(_CUDAToolkit_find_root_dir ) + cmake_parse_arguments(arg "COMPILER_PATHS" "" "SEARCH_PATHS;FIND_FLAGS" ${ARGN}) + + if(NOT CUDAToolkit_BIN_DIR) + if(arg_COMPILER_PATHS) + # need to find parent dir, since this could clang and not nvcc 
+ if(EXISTS "${CMAKE_CUDA_COMPILER}") + get_filename_component(possible_nvcc_path "${CMAKE_CUDA_COMPILER}" PROGRAM PROGRAM_ARGS CUDAToolkit_compiler_args) + get_filename_component(possible_nvcc_path "${possible_nvcc_path}" DIRECTORY) + elseif(EXISTS "$ENV{CUDACXX}") + get_filename_component(possible_nvcc_path "$ENV{CUDACXX}" PROGRAM PROGRAM_ARGS CUDAToolkit_compiler_args) + get_filename_component(possible_nvcc_path "${possible_nvcc_path}" DIRECTORY) + endif() + if(possible_nvcc_path) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + NO_DEFAULT_PATH + PATHS ${possible_nvcc_path} + ) + endif() + else() + if(NOT CUDAToolkit_SENTINEL_FILE) + find_program(CUDAToolkit_NVCC_EXECUTABLE + NAMES nvcc nvcc.exe + PATHS ${arg_SEARCH_PATHS} + ${arg_FIND_FLAGS} + ) + endif() + + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + find_file(CUDAToolkit_SENTINEL_FILE + NAMES version.txt version.json + PATHS ${arg_SEARCH_PATHS} + NO_DEFAULT_PATH + ) + endif() + endif() + + if(EXISTS "${CUDAToolkit_NVCC_EXECUTABLE}") + # If NVCC exists then invoke it to find the toolkit location. + # This allows us to support wrapper scripts (e.g. ccache or colornvcc), CUDA Toolkit, + # NVIDIA HPC SDK, and distro's splayed layouts + + + #Allow the user to specify a host compiler except for Visual Studio + if(NOT $ENV{CUDAHOSTCXX} STREQUAL "") + get_filename_component(CUDAToolkit_CUDA_HOST_COMPILER $ENV{CUDAHOSTCXX} PROGRAM) + if(NOT EXISTS ${CUDAToolkit_CUDA_HOST_COMPILER}) + message(FATAL_ERROR "Could not find the compiler specified in the environment variable CUDAHOSTCXX:\n$ENV{CUDAHOSTCXX}.\n${CUDAToolkit_CUDA_HOST_COMPILER}") + endif() + elseif(CUDAToolkit_CUDA_HOST_COMPILER) + # We get here if CUDAToolkit_CUDA_HOST_COMPILER was specified by the user or toolchain file. + if(IS_ABSOLUTE "${CUDAToolkit_CUDA_HOST_COMPILER}") + # Convert to forward slashes. 
+ cmake_path(CONVERT "${CUDAToolkit_CUDA_HOST_COMPILER}" TO_CMAKE_PATH_LIST CUDAToolkit_CUDA_HOST_COMPILER NORMALIZE) + else() + # Convert to absolute path so changes in `PATH` do not impact CUDA compilation. + find_program(_CUDAToolkit_CUDA_HOST_COMPILER_PATH NO_CACHE NAMES "${CUDAToolkit_CUDA_HOST_COMPILER}") + if(_CUDAToolkit_CUDA_HOST_COMPILER_PATH) + set(CUDAToolkit_CUDA_HOST_COMPILER "${_CUDAToolkit_CUDA_HOST_COMPILER_PATH}") + endif() + unset(_CUDAToolkit_CUDA_HOST_COMPILER_PATH) + endif() + if(NOT EXISTS "${CUDAToolkit_CUDA_HOST_COMPILER}") + message(FATAL_ERROR "Could not find the compiler specified in the variable CUDAToolkit_CUDA_HOST_COMPILER:\n ${CUDAToolkit_CUDA_HOST_COMPILER}") + endif() + # If the value was cached, update the cache entry with our modifications. + get_property(_CUDAToolkit_CUDA_HOST_COMPILER_CACHED CACHE CUDAToolkit_CUDA_HOST_COMPILER PROPERTY TYPE) + if(_CUDAToolkit_CUDA_HOST_COMPILER_CACHED) + set_property(CACHE CUDAToolkit_CUDA_HOST_COMPILER PROPERTY VALUE "${CUDAToolkit_CUDA_HOST_COMPILER}") + mark_as_advanced(CUDAToolkit_CUDA_HOST_COMPILER) + endif() + unset(_CUDAToolkit_CUDA_HOST_COMPILER_CACHED) + endif() + + if(CUDAToolkit_CUDA_HOST_COMPILER) + set(nvcc_ccbin_flag "-ccbin=${CUDAToolkit_CUDA_HOST_COMPILER}") + endif() + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "${nvcc_ccbin_flag}" "-v" "__cmake_determine_cuda" + OUTPUT_VARIABLE _CUDA_NVCC_OUT ERROR_VARIABLE _CUDA_NVCC_OUT) + message(CONFIGURE_LOG + "Executed nvcc to extract CUDAToolkit information:\n${_CUDA_NVCC_OUT}\n\n") + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ TOP=([^\r\n]*)") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_MATCH_1}/bin" ABSOLUTE) + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc location:\n${CUDAToolkit_BIN_DIR}\n\n") + else() + get_filename_component(CUDAToolkit_BIN_DIR "${CUDAToolkit_NVCC_EXECUTABLE}" DIRECTORY) + endif() + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ INCLUDES=([^\r\n]*)") + separate_arguments(_nvcc_output NATIVE_COMMAND 
"${CMAKE_MATCH_1}") + foreach(line IN LISTS _nvcc_output) + string(REGEX REPLACE "^-I" "" line "${line}") + get_filename_component(line "${line}" ABSOLUTE) + list(APPEND _cmake_CUDAToolkit_include_directories "${line}") + endforeach() + endif() + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ SYSTEM_INCLUDES=([^\r\n]*)") + unset(_nvcc_output) + separate_arguments(_nvcc_output NATIVE_COMMAND "${CMAKE_MATCH_1}") + foreach(line IN LISTS _nvcc_output) + string(REGEX REPLACE "^-isystem" "" line "${line}") + if(line) + get_filename_component(line "${line}" ABSOLUTE) + list(APPEND _cmake_CUDAToolkit_include_directories "${line}") + endif() + endforeach() + endif() + if(DEFINED _cmake_CUDAToolkit_include_directories) + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc implicit include information:\n${_cmake_CUDAToolkit_include_directories}\n\n") + set(_cmake_CUDAToolkit_include_directories "${_cmake_CUDAToolkit_include_directories}" CACHE INTERNAL "CUDAToolkit internal list of include directories") + endif() + if(_CUDA_NVCC_OUT MATCHES "\\#\\$ LIBRARIES=([^\r\n]*)") + include(${CMAKE_ROOT}/Modules/CMakeParseImplicitLinkInfo.cmake) + set(_nvcc_link_line "cuda-fake-ld ${CMAKE_MATCH_1}") + CMAKE_PARSE_IMPLICIT_LINK_INFO("${_nvcc_link_line}" + _cmake_CUDAToolkit_implicit_link_libs + _cmake_CUDAToolkit_implicit_link_directories + _cmake_CUDAToolkit_implicit_frameworks + _nvcc_log + "${CMAKE_CUDA_IMPLICIT_OBJECT_REGEX}" + LANGUAGE CUDA) + message(CONFIGURE_LOG + "Parsed CUDAToolkit nvcc implicit link information:\n${_nvcc_log}\n${_cmake_CUDAToolkit_implicit_link_directories}\n\n") + unset(_nvcc_link_line) + unset(_cmake_CUDAToolkit_implicit_link_libs) + unset(_cmake_CUDAToolkit_implicit_frameworks) + + set(_cmake_CUDAToolkit_implicit_link_directories "${_cmake_CUDAToolkit_implicit_link_directories}" CACHE INTERNAL "CUDAToolkit internal list of implicit link directories") + endif() + unset(_CUDA_NVCC_OUT) + + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + 
mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + + if(CUDAToolkit_SENTINEL_FILE) + get_filename_component(CUDAToolkit_BIN_DIR ${CUDAToolkit_SENTINEL_FILE} DIRECTORY ABSOLUTE) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}/bin") + + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "" FORCE) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + endif() + + if(DEFINED _cmake_CUDAToolkit_include_directories) + _CUDAToolkit_build_include_dirs(_cmake_CUDAToolkit_contents _cmake_CUDAToolkit_include_directories) + set(CUDAToolkit_INCLUDE_DIRECTORIES "${_cmake_CUDAToolkit_contents}" PARENT_SCOPE) + endif() + if(DEFINED _cmake_CUDAToolkit_implicit_link_directories) + _CUDAToolkit_build_library_dirs(_cmake_CUDAToolkit_contents _cmake_CUDAToolkit_implicit_link_directories) + set(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES "${_cmake_CUDAToolkit_contents}" PARENT_SCOPE) + endif() + + if(CUDAToolkit_BIN_DIR) + get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) + set(CUDAToolkit_ROOT_DIR "${CUDAToolkit_ROOT_DIR}" PARENT_SCOPE) + endif() + + endfunction() + + function(_CUDAToolkit_guess_root_dir) + # CUDAToolkit_ROOT cmake / env variable not specified, try platform defaults. + # + # - Linux: /usr/local/cuda-X.Y + # - macOS: /Developer/NVIDIA/CUDA-X.Y + # - Windows: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\vX.Y + # + # We will also search the default symlink location /usr/local/cuda first since + # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked + # directory is the desired location. + if(UNIX) + if(NOT APPLE) + set(platform_base "/usr/local/cuda-") + else() + set(platform_base "/Developer/NVIDIA/CUDA-") + endif() + else() + set(platform_base "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v") + endif() + + # Build out a descending list of possible cuda installations, e.g. + file(GLOB possible_paths "${platform_base}*") + # Iterate the glob results and create a descending list. 
+ set(versions) + foreach(p ${possible_paths}) + # Extract version number from end of string + string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) + if(IS_DIRECTORY ${p} AND p_version) + list(APPEND versions ${p_version}) + endif() + endforeach() + + # Sort numerically in descending order, so we try the newest versions first. + list(SORT versions COMPARE NATURAL ORDER DESCENDING) + + # With a descending list of versions, populate possible paths to search. + set(search_paths) + foreach(v ${versions}) + list(APPEND search_paths "${platform_base}${v}") + endforeach() + + # Force the global default /usr/local/cuda to the front on Unix. + if(UNIX) + list(INSERT search_paths 0 "/usr/local/cuda") + endif() + + # Now search for the toolkit again using the platform default search paths. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${search_paths}" FIND_FLAGS PATH_SUFFIXES bin) + if(CUDAToolkit_ROOT_DIR) + set(CUDAToolkit_ROOT_DIR "${CUDAToolkit_ROOT_DIR}" PARENT_SCOPE) + endif() + + # We are done with these variables now, cleanup for caller. + unset(platform_base) + unset(possible_paths) + unset(versions) + unset(search_paths) + endfunction() + + function(_CUDAToolkit_find_version_file result_variable) + # We first check for a non-scattered installation to prefer it over a scattered installation. 
+ set(version_files version.txt version.json) + foreach(vf IN LISTS version_files) + if(CUDAToolkit_ROOT AND EXISTS "${CUDAToolkit_ROOT}/${vf}") + set(${result_variable} "${CUDAToolkit_ROOT}/${vf}" PARENT_SCOPE) + break() + elseif(CUDAToolkit_ROOT_DIR AND EXISTS "${CUDAToolkit_ROOT_DIR}/${vf}") + set(${result_variable} "${CUDAToolkit_ROOT_DIR}/${vf}" PARENT_SCOPE) + break() + elseif(CMAKE_SYSROOT_LINK AND EXISTS "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/${vf}") + set(${result_variable} "${CMAKE_SYSROOT_LINK}/usr/lib/cuda/${vf}" PARENT_SCOPE) + break() + elseif(EXISTS "${CMAKE_SYSROOT}/usr/lib/cuda/${vf}") + set(${result_variable} "${CMAKE_SYSROOT}/usr/lib/cuda/${vf}" PARENT_SCOPE) + break() + endif() + endforeach() + endfunction() + + function(_CUDAToolkit_parse_version_file version_file) + if(version_file) + file(READ "${version_file}" file_contents) + cmake_path(GET version_file EXTENSION LAST_ONLY version_ext) + if(version_ext STREQUAL ".json") + string(JSON cuda_version_info GET "${file_contents}" "cuda" "version") + set(cuda_version_match_regex [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + elseif(version_ext STREQUAL ".txt") + set(cuda_version_info "${file_contents}") + set(cuda_version_match_regex [=[CUDA Version ([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + endif() + + if(cuda_version_info MATCHES "${cuda_version_match_regex}") + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}" PARENT_SCOPE) + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}" PARENT_SCOPE) + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}" PARENT_SCOPE) + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}" PARENT_SCOPE) + endif() + endif() + endfunction() + + macro(_CUDAToolkit_find_failure_message _CUDAToolkit_fail_mode) + # Declare error messages now, print later depending on find_package args. 
+ if("${_CUDAToolkit_fail_mode}" STREQUAL "GUESS") + set(_CUDAToolkit_fail_message "Could not find `nvcc` executable in any searched paths, please set CUDAToolkit_ROOT") + elseif("${_CUDAToolkit_fail_mode}" STREQUAL "VARIABLE") + set(_CUDAToolkit_fail_message "Could not find `nvcc` executable in path specified by variable CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") + else() + set(_CUDAToolkit_fail_message "Could not find `nvcc` executable in path specified by environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") + endif() + + if(CUDAToolkit_FIND_REQUIRED) + message(FATAL_ERROR ${_CUDAToolkit_fail_message}) + else() + if(NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS ${_CUDAToolkit_fail_message}) + endif() + set(CUDAToolkit_FOUND FALSE) + unset(_CUDAToolkit_fail_message) + return() + endif() + endmacro() + + # For NVCC we can easily deduce the SDK binary directory from the compiler path. + if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA") + get_filename_component(CUDAToolkit_BIN_DIR "${CMAKE_CUDA_COMPILER}" DIRECTORY) + set(CUDAToolkit_BIN_DIR "${CUDAToolkit_BIN_DIR}" CACHE PATH "") + # Try language provided path first. + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_BIN_DIR}" FIND_FLAGS NO_DEFAULT_PATH) + mark_as_advanced(CUDAToolkit_BIN_DIR) + endif() + + # Try `CMAKE_CUDA_COMPILER` and `ENV{CUDACXX}` + if(NOT CUDAToolkit_ROOT_DIR) + _CUDAToolkit_find_root_dir(COMPILER_PATHS) + endif() + + # Try user provided path + if(NOT CUDAToolkit_ROOT_DIR AND DEFINED CUDAToolkit_ROOT) + _CUDAToolkit_find_root_dir(SEARCH_PATHS "${CUDAToolkit_ROOT}" FIND_FLAGS PATH_SUFFIXES bin NO_DEFAULT_PATH) + if(NOT CUDAToolkit_ROOT_DIR) + # If the user specified CUDAToolkit_ROOT but the toolkit could not be found, this is an error. 
+ _CUDAToolkit_find_failure_message(VARIABLE) + endif() + endif() + + if(NOT CUDAToolkit_ROOT_DIR AND DEFINED ENV{CUDAToolkit_ROOT}) + _CUDAToolkit_find_root_dir(SEARCH_PATHS "$ENV{CUDAToolkit_ROOT}" FIND_FLAGS PATH_SUFFIXES bin NO_DEFAULT_PATH) + if(NOT CUDAToolkit_ROOT_DIR) + # If the user specified ENV{CUDAToolkit_ROOT} but the toolkit could not be found, this is an error. + _CUDAToolkit_find_failure_message(ENV) + endif() + endif() + + # Try users PATH, and CUDA_PATH env variable + if(NOT CUDAToolkit_ROOT_DIR) + _CUDAToolkit_find_root_dir(FIND_FLAGS PATHS ENV CUDA_PATH PATH_SUFFIXES bin) + endif() + + # Try guessing where CUDA is installed + if(NOT CUDAToolkit_ROOT_DIR) + _CUDAToolkit_guess_root_dir() + if(NOT CUDAToolkit_ROOT_DIR) + _CUDAToolkit_find_failure_message(GUESS) + endif() + endif() + + _CUDAToolkit_find_version_file( _CUDAToolkit_version_file ) + if(_CUDAToolkit_version_file) + # CUDAToolkit_LIBRARY_ROOT contains the device library and version file. + get_filename_component(CUDAToolkit_LIBRARY_ROOT "${_CUDAToolkit_version_file}" DIRECTORY ABSOLUTE) + endif() + unset(_CUDAToolkit_version_file) + + if(CUDAToolkit_NVCC_EXECUTABLE AND + CMAKE_CUDA_COMPILER_VERSION AND + CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) + # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value + # This if statement will always match, but is used to provide variables for MATCH 1,2,3... 
+ if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_CUDA_COMPILER_VERSION}") + endif() + elseif(CUDAToolkit_NVCC_EXECUTABLE) + # Compute the version by invoking nvcc + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) + set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") + set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") + set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + endif() + unset(NVCC_OUT) + else() + _CUDAToolkit_find_version_file(version_file) + _CUDAToolkit_parse_version_file("${version_file}") + endif() +endif() + +# Figure out the target directory when either crosscompiling +# or if we don't have `nvcc` and need to deduce the target arch +if(CMAKE_CROSSCOMPILING OR NOT CUDAToolkit_NVCC_EXECUTABLE) + # When a language is enabled we can use its compiler's target architecture. + if(CMAKE_CUDA_COMPILER_LOADED AND CMAKE_CUDA_COMPILER_ARCHITECTURE_ID) + set(_CUDA_TARGET_PROCESSOR "${CMAKE_CUDA_COMPILER_ARCHITECTURE_ID}") + elseif(CMAKE_CXX_COMPILER_LOADED AND CMAKE_CXX_COMPILER_ARCHITECTURE_ID) + set(_CUDA_TARGET_PROCESSOR "${CMAKE_CXX_COMPILER_ARCHITECTURE_ID}") + elseif(CMAKE_C_COMPILER_LOADED AND CMAKE_C_COMPILER_ARCHITECTURE_ID) + set(_CUDA_TARGET_PROCESSOR "${CMAKE_C_COMPILER_ARCHITECTURE_ID}") + elseif(CMAKE_SYSTEM_PROCESSOR) + set(_CUDA_TARGET_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}") + elseif(CMAKE_CROSSCOMPILING) + message(FATAL_ERROR "Cross-compiling with the CUDA toolkit requires CMAKE_SYSTEM_PROCESSOR to be set.") + endif() + # Keep in sync with equivalent table in CMakeDetermineCUDACompiler and FindCUDA! 
+ if(_CUDA_TARGET_PROCESSOR STREQUAL "armv7-a") + # Support for NVPACK + set(CUDAToolkit_TARGET_NAMES "armv7-linux-androideabi") + elseif(_CUDA_TARGET_PROCESSOR MATCHES "arm") + set(CUDAToolkit_TARGET_NAMES "armv7-linux-gnueabihf") + elseif(_CUDA_TARGET_PROCESSOR MATCHES "aarch64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") + set(CUDAToolkit_TARGET_NAMES "aarch64-linux-androideabi") + elseif (CMAKE_SYSTEM_NAME STREQUAL "QNX") + set(CUDAToolkit_TARGET_NAMES "aarch64-qnx") + else() + set(CUDAToolkit_TARGET_NAMES "aarch64-linux" "sbsa-linux") + endif() + elseif(_CUDA_TARGET_PROCESSOR STREQUAL "x86_64") + set(CUDAToolkit_TARGET_NAMES "x86_64-linux") + endif() + unset(_CUDA_TARGET_PROCESSOR) + + foreach(CUDAToolkit_TARGET_NAME IN LISTS CUDAToolkit_TARGET_NAMES) + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + # add known CUDA target root path to the set of directories we search for programs, libraries and headers + list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") + + # Mark that we need to pop the root search path changes after we have + # found all cuda libraries so that searches for our cross-compilation + # libraries work when another cuda sdk is in CMAKE_PREFIX_PATH or + # PATH + set(_CUDAToolkit_Pop_ROOT_PATH True) + break() + endif() + endforeach() +endif() + + #If not already set we simply use the toolkit root +if(NOT CUDAToolkit_TARGET_DIR) + set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}") +endif() + +# Determine windows search path suffix for libraries +if(CMAKE_HOST_SYSTEM_NAME STREQUAL "Windows") + if(CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "AMD64") + set(_CUDAToolkit_win_search_dirs lib/x64) + set(_CUDAToolkit_win_stub_search_dirs lib/x64/stubs) + endif() +endif() + +# We don't need to verify the cuda_runtime header when we are using `nvcc` include paths +# as the compiler being enabled means the header was found +if(NOT 
CUDAToolkit_INCLUDE_DIRECTORIES) + # Otherwise use CUDAToolkit_TARGET_DIR to guess where the `cuda_runtime.h` is located + # On a scattered installation /usr, on a non-scattered something like /usr/local/cuda or /usr/local/cuda-10.2/targets/aarch64-linux. + if(EXISTS "${CUDAToolkit_TARGET_DIR}/include/cuda_runtime.h") + set(CUDAToolkit_INCLUDE_DIRECTORIES "${CUDAToolkit_TARGET_DIR}/include") + else() + message(STATUS "Unable to find cuda_runtime.h in \"${CUDAToolkit_TARGET_DIR}/include\" for CUDAToolkit_INCLUDE_DIRECTORIES.") + endif() +endif() + +# The NVHPC layout moves math library headers and libraries to a sibling directory and it could be nested under +# the version of the CUDA toolchain +# Create a separate variable so this directory can be selectively added to math targets. +find_path(CUDAToolkit_CUBLAS_INCLUDE_DIR cublas_v2.h PATHS + ${CUDAToolkit_INCLUDE_DIRECTORIES} + NO_DEFAULT_PATH) + +if(NOT CUDAToolkit_CUBLAS_INCLUDE_DIR) + file(REAL_PATH "${CUDAToolkit_TARGET_DIR}" CUDAToolkit_MATH_INCLUDE_DIR) + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "../../math_libs/") + if(EXISTS "${CUDAToolkit_MATH_INCLUDE_DIR}/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/") + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/") + endif() + cmake_path(APPEND CUDAToolkit_MATH_INCLUDE_DIR "include") + cmake_path(NORMAL_PATH CUDAToolkit_MATH_INCLUDE_DIR) + + find_path(CUDAToolkit_CUBLAS_INCLUDE_DIR cublas_v2.h PATHS + ${CUDAToolkit_MATH_INCLUDE_DIR} + NO_DEFAULT_PATH + ) + if(CUDAToolkit_CUBLAS_INCLUDE_DIR) + list(APPEND CUDAToolkit_INCLUDE_DIRECTORIES "${CUDAToolkit_CUBLAS_INCLUDE_DIR}") + endif() +endif() +unset(CUDAToolkit_CUBLAS_INCLUDE_DIR CACHE) +unset(CUDAToolkit_CUBLAS_INCLUDE_DIR) + +# Find the CUDA Runtime Library libcudart +find_library(CUDA_CUDART + NAMES cudart + PATHS ${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES} ${CUDAToolkit_TARGET_DIR} + PATH_SUFFIXES lib64 ${_CUDAToolkit_win_search_dirs} +) 
+find_library(CUDA_CUDART + NAMES cudart + PATHS ${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES} ${CUDAToolkit_TARGET_DIR} + PATH_SUFFIXES lib64/stubs ${_CUDAToolkit_win_stub_search_dirs} lib/stubs stubs +) + +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) + message(STATUS "Unable to find cudart library.") +endif() + +#----------------------------------------------------------------------------- +# Perform version comparison and validate all required variables are set. +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(CUDAToolkit + REQUIRED_VARS + CUDAToolkit_INCLUDE_DIRECTORIES + CUDA_CUDART + CUDAToolkit_BIN_DIR + VERSION_VAR + CUDAToolkit_VERSION +) + +unset(CUDAToolkit_ROOT_DIR) +mark_as_advanced(CUDA_CUDART + CUDAToolkit_NVCC_EXECUTABLE + CUDAToolkit_SENTINEL_FILE + ) + +#----------------------------------------------------------------------------- +# Construct result variables +if(CUDAToolkit_FOUND) + set(CUDAToolkit_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRECTORIES}") + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + + # Build search paths without any symlinks + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}" _cmake_search_dir) + set(CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + + # Detect we are in a splayed nvhpc toolkit layout and add extra + # search paths without symlinks + # + # When the `nvcc` compiler output is parsed we have already resolved + # symlinks so we have `cuda/12.X/targets/....` and not `cuda/12.X/lib64`. 
+ if(CUDAToolkit_LIBRARY_DIR MATCHES ".*/cuda/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/(lib64$|targets/)") + # Search location for math_libs/ + block(SCOPE_FOR POLICIES) + cmake_policy(SET CMP0152 NEW) + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}/../../../../../" _cmake_search_dir) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + + # Search location for extras like cupti + file(REAL_PATH "${CUDAToolkit_LIBRARY_DIR}/../../../" _cmake_search_dir) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${_cmake_search_dir}") + endblock() + endif() + + if(DEFINED CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES) + list(APPEND CUDAToolkit_LIBRARY_SEARCH_DIRS "${CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES}") + endif() + + # If no `CUDAToolkit_LIBRARY_ROOT` exists set it based on CUDAToolkit_LIBRARY_DIR + if(NOT DEFINED CUDAToolkit_LIBRARY_ROOT) + foreach(CUDAToolkit_search_loc IN LISTS CUDAToolkit_LIBRARY_DIR CUDAToolkit_BIN_DIR) + get_filename_component(CUDAToolkit_possible_lib_root "${CUDAToolkit_search_loc}" DIRECTORY ABSOLUTE) + if(EXISTS "${CUDAToolkit_possible_lib_root}/nvvm/") + set(CUDAToolkit_LIBRARY_ROOT "${CUDAToolkit_possible_lib_root}") + break() + endif() + endforeach() + unset(CUDAToolkit_search_loc) + unset(CUDAToolkit_possible_lib_root) + endif() +else() + # clear cache results when we fail + unset(_cmake_CUDAToolkit_implicit_link_directories CACHE) + unset(_cmake_CUDAToolkit_include_directories CACHE) + unset(CUDA_CUDART CACHE) + unset(CUDAToolkit_BIN_DIR CACHE) + unset(CUDAToolkit_NVCC_EXECUTABLE CACHE) + unset(CUDAToolkit_SENTINEL_FILE CACHE) +endif() +unset(CUDAToolkit_IMPLICIT_LIBRARY_DIRECTORIES) +unset(CUDAToolkit_INCLUDE_DIRECTORIES) + +# CUDAToolkit_LIBRARY_ROOT is accidentally set to the target directory in some environments +# when the CUDA language is enabled, so patch it out +if(CUDAToolkit_LIBRARY_ROOT MATCHES "^(.*)/targets/([^/]*)$") + set(CUDAToolkit_LIBRARY_ROOT "${CMAKE_MATCH_1}") +endif() + 
+#----------------------------------------------------------------------------- +# Construct import targets +if(CUDAToolkit_FOUND) + + function(_CUDAToolkit_find_and_add_import_lib lib_name) + cmake_parse_arguments(arg "" "" "ALT;DEPS;EXTRA_PATH_SUFFIXES;EXTRA_INCLUDE_DIRS;ONLY_SEARCH_FOR;LIBRARY_SEARCH_DIRS" ${ARGN}) + + if(NOT arg_LIBRARY_SEARCH_DIRS) + set(arg_LIBRARY_SEARCH_DIRS "${CUDAToolkit_LIBRARY_SEARCH_DIRS}") + endif() + + if(arg_ONLY_SEARCH_FOR) + set(search_names ${arg_ONLY_SEARCH_FOR}) + else() + set(search_names ${lib_name} ${arg_ALT}) + endif() + + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${arg_LIBRARY_SEARCH_DIRS} + ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 ${_CUDAToolkit_win_search_dirs} lib + # Support NVHPC splayed math library layout + math_libs/${CUDAToolkit_VERSION_MAJOR}.${CUDAToolkit_VERSION_MINOR}/lib64 + math_libs/lib64 + ${arg_EXTRA_PATH_SUFFIXES} + ) + # Don't try any stub directories until we have exhausted all other + # search locations. + set(CUDA_IMPORT_PROPERTY IMPORTED_LOCATION) + set(CUDA_IMPORT_TYPE UNKNOWN) + if(NOT CUDA_${lib_name}_LIBRARY) + find_library(CUDA_${lib_name}_LIBRARY + NAMES ${search_names} + HINTS ${arg_LIBRARY_SEARCH_DIRS} + ENV CUDA_PATH + PATH_SUFFIXES lib64/stubs ${_CUDAToolkit_win_stub_search_dirs} lib/stubs stubs + ) + endif() + if(CUDA_${lib_name}_LIBRARY MATCHES "/stubs/" AND NOT CUDA_${lib_name}_LIBRARY MATCHES "\\.a$" AND NOT WIN32) + # Use a SHARED library with IMPORTED_IMPLIB, but not IMPORTED_LOCATION, + # to indicate that the stub is for linkers but not dynamic loaders. + # It will not contribute any RPATH entry. When encountered as + # a private transitive dependency of another shared library, + # it will be passed explicitly to linkers so they can find it + # even when the runtime library file does not exist on disk. 
+ set(CUDA_IMPORT_PROPERTY IMPORTED_IMPLIB) + set(CUDA_IMPORT_TYPE SHARED) + endif() + + mark_as_advanced(CUDA_${lib_name}_LIBRARY) + + if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + add_library(CUDA::${lib_name} ${CUDA_IMPORT_TYPE} IMPORTED) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + if(DEFINED CUDAToolkit_MATH_INCLUDE_DIR) + string(FIND ${CUDA_${lib_name}_LIBRARY} "math_libs" math_libs) + if(NOT ${math_libs} EQUAL -1) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_MATH_INCLUDE_DIR}") + endif() + endif() + set_property(TARGET CUDA::${lib_name} PROPERTY ${CUDA_IMPORT_PROPERTY} "${CUDA_${lib_name}_LIBRARY}") + foreach(dep ${arg_DEPS}) + if(TARGET CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + endif() + endforeach() + if(arg_EXTRA_INCLUDE_DIRS) + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${arg_EXTRA_INCLUDE_DIRS}") + endif() + endif() + endfunction() + + if(NOT TARGET CUDA::toolkit) + add_library(CUDA::toolkit IMPORTED INTERFACE) + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + endif() + + # setup dependencies that are required for cudart/cudart_static when building + # on linux. These are generally only required when using the CUDA toolkit + # when CUDA language is disabled + if(NOT TARGET CUDA::cudart_static_deps) + add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) + if(UNIX AND (CMAKE_C_COMPILER_LOADED OR CMAKE_CXX_COMPILER_LOADED)) + find_package(Threads REQUIRED) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + endif() + + if(UNIX AND NOT APPLE AND NOT (CMAKE_SYSTEM_NAME STREQUAL "QNX")) + # On Linux, you must link against librt when using the static cuda runtime. 
+ find_library(CUDAToolkit_rt_LIBRARY rt) + mark_as_advanced(CUDAToolkit_rt_LIBRARY) + if(NOT CUDAToolkit_rt_LIBRARY) + message(WARNING "Could not find librt library, needed by CUDA::cudart_static") + else() + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(cudart DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(cudart_static DEPS cudart_static_deps) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0.0) + _CUDAToolkit_find_and_add_import_lib(nvJitLink) + _CUDAToolkit_find_and_add_import_lib(nvJitLink_static DEPS cudart_static_deps nvptxcompiler_static) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.4.0) + _CUDAToolkit_find_and_add_import_lib(nvfatbin DEPS cudart_static_deps) + _CUDAToolkit_find_and_add_import_lib(nvfatbin_static DEPS cudart_static_deps) + endif() + + _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library + foreach (cuda_lib cublasLt cufft nvjpeg) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS cudart_static_deps culibos) + endforeach() + foreach (cuda_lib curand nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + endforeach() + + _CUDAToolkit_find_and_add_import_lib(cusparse DEPS nvJitLink) + _CUDAToolkit_find_and_add_import_lib(cusparse_static DEPS nvJitLink_static culibos) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.0.0) + # cublas depends on cublasLt + # https://docs.nvidia.com/cuda/archive/11.0/cublas#static-library + _CUDAToolkit_find_and_add_import_lib(cublas DEPS cublasLt culibos) + _CUDAToolkit_find_and_add_import_lib(cublas_static DEPS cublasLt_static culibos) + else() + _CUDAToolkit_find_and_add_import_lib(cublas DEPS culibos) + 
_CUDAToolkit_find_and_add_import_lib(cublas_static DEPS culibos) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.4) + _CUDAToolkit_find_and_add_import_lib(cuFile ALT cufile DEPS culibos) + _CUDAToolkit_find_and_add_import_lib(cuFile_static ALT cufile_static DEPS culibos) + + _CUDAToolkit_find_and_add_import_lib(cuFile_rdma ALT cufile_rdma DEPS cuFile culibos) + _CUDAToolkit_find_and_add_import_lib(cuFile_rdma_static ALT cufile_rdma_static DEPS cuFile_static culibos) + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.6) + _CUDAToolkit_find_and_add_import_lib(cudla) + endif() + + + # cuFFTW depends on cuFFT + _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) + _CUDAToolkit_find_and_add_import_lib(cufftw_static DEPS cufft_static) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 9.2) + _CUDAToolkit_find_and_add_import_lib(cufft_static_nocallback DEPS culibos) + endif() + + # cuSOLVER depends on cuBLAS, and cuSPARSE + set(cusolver_deps cublas cusparse) + set(cusolver_static_deps cublas_static cusparse_static culibos) + if(CUDAToolkit_VERSION VERSION_GREATER 11.2.1) + # cusolver depends on libcusolver_metis and cublasLt + # https://docs.nvidia.com/cuda/archive/11.2.2/cusolver#link-dependency + list(APPEND cusolver_deps cublasLt) + _CUDAToolkit_find_and_add_import_lib(cusolver_metis_static ALT metis_static) # implementation detail static lib + list(APPEND cusolver_static_deps cusolver_metis_static cublasLt_static) + endif() + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.1.2) + # cusolver depends on liblapack_static.a starting with CUDA 10.1 update 2, + # https://docs.nvidia.com/cuda/archive/11.5.0/cusolver#static-link-lapack + _CUDAToolkit_find_and_add_import_lib(cusolver_lapack_static ALT lapack_static) # implementation detail static lib + list(APPEND cusolver_static_deps cusolver_lapack_static) + endif() + _CUDAToolkit_find_and_add_import_lib(cusolver DEPS ${cusolver_deps}) + _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS 
${cusolver_static_deps}) + unset(cusolver_deps) + unset(cusolver_static_deps) + + # nvGRAPH depends on cuRAND, and cuSOLVER. + _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + + # Process the majority of the NPP libraries. + foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + endforeach() + + find_path(CUDAToolkit_CUPTI_INCLUDE_DIR cupti.h PATHS + "${CUDAToolkit_ROOT_DIR}/extras/CUPTI/include" + ${CUDAToolkit_INCLUDE_DIRS} + PATH_SUFFIXES "../extras/CUPTI/include" + "../../../extras/CUPTI/include" + NO_DEFAULT_PATH) + mark_as_advanced(CUDAToolkit_CUPTI_INCLUDE_DIR) + + if(CUDAToolkit_CUPTI_INCLUDE_DIR) + set(_cmake_cupti_extra_paths extras/CUPTI/lib64/ + extras/CUPTI/lib/ + ../extras/CUPTI/lib64/ + ../extras/CUPTI/lib/ + ../../../extras/CUPTI/lib64/ + ../../../extras/CUPTI/lib/) + _CUDAToolkit_find_and_add_import_lib(cupti + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(cupti_static + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.2.0) + _CUDAToolkit_find_and_add_import_lib(nvperf_host + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(nvperf_host_static + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + _CUDAToolkit_find_and_add_import_lib(nvperf_target + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + endif() + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.3.0) + 
_CUDAToolkit_find_and_add_import_lib(pcsamplingutil + EXTRA_PATH_SUFFIXES ${_cmake_cupti_extra_paths} + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}") + endif() + endif() + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.1.0) + if(NOT TARGET CUDA::nvptxcompiler_static) + _CUDAToolkit_find_and_add_import_lib(nvptxcompiler_static) + if(TARGET CUDA::nvptxcompiler_static) + target_link_libraries(CUDA::nvptxcompiler_static INTERFACE CUDA::cudart_static_deps) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(nvrtc_builtins ALT nvrtc-builtins) + _CUDAToolkit_find_and_add_import_lib(nvrtc) + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 11.5.0) + _CUDAToolkit_find_and_add_import_lib(nvrtc_builtins_static ALT nvrtc-builtins_static) + if(NOT TARGET CUDA::nvrtc_static) + _CUDAToolkit_find_and_add_import_lib(nvrtc_static DEPS nvrtc_builtins_static nvptxcompiler_static) + if(TARGET CUDA::nvrtc_static AND WIN32 AND NOT (BORLAND OR MINGW OR CYGWIN)) + target_link_libraries(CUDA::nvrtc_static INTERFACE Ws2_32.lib) + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _CUDAToolkit_find_and_add_import_lib(nvml_static ONLY_SEARCH_FOR libnvidia-ml.a libnvml.a) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0) + # Header-only variant. Uses dlopen(). 
+ if(NOT TARGET CUDA::nvtx3) + add_library(CUDA::nvtx3 INTERFACE IMPORTED) + target_include_directories(CUDA::nvtx3 SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::nvtx3 INTERFACE ${CMAKE_DL_LIBS}) + endif() + endif() + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.9) + if(NOT TARGET CUDA::nvtx3_interop) + _CUDAToolkit_find_and_add_import_lib(nvtx3_interop ALT nvtx3interop) + endif() + endif() + + # nvToolsExt is removed starting in 12.9 + if(CUDAToolkit_VERSION VERSION_LESS 12.9) + if(WIN32) + # nvtools can be installed outside the CUDA toolkit directory + # so prefer the NVTOOLSEXT_PATH windows only environment variable + # In addition on windows the most common name is nvToolsExt64_1 + find_library(CUDA_nvToolsExt_LIBRARY + NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt + PATHS ENV NVTOOLSEXT_PATH + ENV CUDA_PATH + PATH_SUFFIXES lib/x64 lib + ) + endif() + _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + + if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 10.0) + # nvToolsExt is deprecated since nvtx3 introduction. + # Warn only if the project requires a sufficiently new CMake to make migration possible. + if(TARGET CUDA::nvToolsExt AND CMAKE_MINIMUM_REQUIRED_VERSION VERSION_GREATER_EQUAL 3.25) + set_property(TARGET CUDA::nvToolsExt PROPERTY DEPRECATION "nvToolsExt has been superseded by nvtx3 since CUDA 10.0 and CMake 3.25. 
Use CUDA::nvtx3 and include instead.") + endif() + endif() + endif() + + _CUDAToolkit_find_and_add_import_lib(OpenCL) + + find_program(CUDA_bin2c_EXECUTABLE + NAMES bin2c + HINTS ${CUDAToolkit_BIN_DIR} + NO_DEFAULT_PATH + ) + if(NOT TARGET CUDA::bin2c AND CUDA_bin2c_EXECUTABLE) + add_executable(CUDA::bin2c IMPORTED) + set_property(TARGET CUDA::bin2c PROPERTY IMPORTED_LOCATION "${CUDA_bin2c_EXECUTABLE}") + endif() + + _CUDAToolkit_find_and_add_import_lib( + sanitizer + ONLY_SEARCH_FOR sanitizer-public + EXTRA_PATH_SUFFIXES + "../compute-sanitizer" + "../../../compute-sanitizer" + "../Sanitizer" + "../../../Sanitizer" + "../extras/Sanitizer" + "../../../extras/Sanitizer" + EXTRA_INCLUDE_DIRS "${CUDAToolkit_CUPTI_INCLUDE_DIR}" + ) + if(TARGET CUDA::sanitizer) + get_property(loc TARGET CUDA::sanitizer PROPERTY IMPORTED_LOCATION) + get_filename_component(sanitizer_dir "${loc}" DIRECTORY) + target_include_directories(CUDA::sanitizer INTERFACE "${sanitizer_dir}/include") + endif() +endif() + +if(_CUDAToolkit_Pop_ROOT_PATH) + list(REMOVE_AT CMAKE_FIND_ROOT_PATH 0) + unset(_CUDAToolkit_Pop_ROOT_PATH) +endif() + +unset(_CUDAToolkit_win_search_dirs) +unset(_CUDAToolkit_win_stub_search_dirs) diff --git a/cpp/tests/utilities/identify_stream_usage.cpp b/cpp/tests/utilities/identify_stream_usage.cpp index 7c6ba60d52e..61abc4577da 100644 --- a/cpp/tests/utilities/identify_stream_usage.cpp +++ b/cpp/tests/utilities/identify_stream_usage.cpp @@ -11,13 +11,13 @@ #include #include +#include +#include #include #include #include #include -#include -#include // This file is compiled into a separate library that is dynamically loaded with LD_PRELOAD at // runtime to libcudf to override some stream-related symbols in libcudf. The goal of such a library @@ -119,234 +119,128 @@ void check_stream_and_error(cudaStream_t stream) } } -/** - * @brief Container for CUDA APIs that have been overloaded using DEFINE_OVERLOAD. - * - * This variable must be initialized before everything else. 
- * - * @see find_originals for a description of the priorities - */ -__attribute__((init_priority(1001))) std::unordered_map originals; +class sanitizer_subscriber { + public: + sanitizer_subscriber(); + ~sanitizer_subscriber(); -/** - * @brief Macro for generating functions to override existing CUDA functions. - * - * Define a new function with the provided signature that checks the used - * stream and raises an exception if it is one of CUDA's default streams. If - * not, the new function forwards all arguments to the original function. - * - * Note that since this only defines the function, we do not need default - * parameter values since those will be provided by the original declarations - * in CUDA itself. - * - * @see find_originals for a description of the priorities - * - * @param function The function to overload. - * @param signature The function signature (must include names, not just types). - * @parameter arguments The function arguments (names only, no types). - */ -#define DEFINE_OVERLOAD(function, signature, arguments) \ - using function##_t = cudaError_t (*)(signature); \ - \ - cudaError_t function(signature) \ - { \ - check_stream_and_error(stream); \ - return ((function##_t)originals[#function])(arguments); \ - } \ - __attribute__((constructor(1002))) void queue_##function() { originals[#function] = nullptr; } + private: + Sanitizer_SubscriberHandle handle; -/** - * @brief Helper macro to define macro arguments that contain a comma. - */ -#define ARG(...) __VA_ARGS__ + static void check_result(SanitizerResult result); -// clang-format off -/* - We need to overload all the functions from the runtime API (assuming that we - don't use the driver API) that accept streams. The main webpage for APIs is - https://docs.nvidia.com/cuda/cuda-runtime-api/modules.html#modules. 
Here are - the modules containing any APIs using streams as of 9/20/2022: - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__STREAM.html - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT - Done - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXTRES__INTEROP.html#group__CUDART__EXTRES__INTEROP - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html#group__CUDART__EXECUTION - Done - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY - Done - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html#group__CUDART__MEMORY__POOLS - Done - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__OPENGL__DEPRECATED.html#group__CUDART__OPENGL__DEPRECATED - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EGL.html#group__CUDART__EGL - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__INTEROP.html#group__CUDART__INTEROP - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__GRAPH.html#group__CUDART__GRAPH - - https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__HIGHLEVEL.html#group__CUDART__HIGHLEVEL - */ -// clang-format on - -// Event APIS: -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT -DEFINE_OVERLOAD(cudaEventRecord, ARG(cudaEvent_t event, cudaStream_t stream), ARG(event, stream)); - -DEFINE_OVERLOAD(cudaEventRecordWithFlags, - ARG(cudaEvent_t event, cudaStream_t stream, unsigned int flags), - ARG(event, stream, flags)); - -// Execution APIS: -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EXECUTION.html#group__CUDART__EXECUTION -DEFINE_OVERLOAD(cudaLaunchKernel, - ARG(void const* func, - dim3 gridDim, - dim3 blockDim, - void** args, - size_t sharedMem, - cudaStream_t stream), - ARG(func, gridDim, blockDim, args, sharedMem, stream)); + template + static void check_stream_arg(const 
Sanitizer_CallbackData* cbdata); -#if CUDART_VERSION >= 13000 -// We need to define the __cudaLaunchKernel ABI as -// it isn't part of cuda_runtime.h when compiling as a C++ source -extern "C" cudaError_t CUDARTAPI __cudaLaunchKernel(cudaKernel_t kernel, - dim3 gridDim, - dim3 blockDim, - void** args, - size_t sharedMem, - cudaStream_t stream); -extern "C" cudaError_t CUDARTAPI __cudaLaunchKernel_ptsz(cudaKernel_t kernel, - dim3 gridDim, - dim3 blockDim, - void** args, - size_t sharedMem, - cudaStream_t stream); -DEFINE_OVERLOAD(__cudaLaunchKernel, - ARG(cudaKernel_t kernel, - dim3 gridDim, - dim3 blockDim, - void** args, - size_t sharedMem, - cudaStream_t stream), - ARG(kernel, gridDim, blockDim, args, sharedMem, stream)); -DEFINE_OVERLOAD(__cudaLaunchKernel_ptsz, - ARG(cudaKernel_t kernel, - dim3 gridDim, - dim3 blockDim, - void** args, - size_t sharedMem, - cudaStream_t stream), - ARG(kernel, gridDim, blockDim, args, sharedMem, stream)); -#endif + void callback(Sanitizer_CallbackDomain domain, Sanitizer_CallbackId cbid, const void* cbdata); +}; + +sanitizer_subscriber::sanitizer_subscriber() +{ + const auto cb = [](void* userdata, + Sanitizer_CallbackDomain domain, + Sanitizer_CallbackId cbid, + const void* cbdata) { + auto* subscriber = static_cast(userdata); + subscriber->callback(domain, cbid, cbdata); + }; + check_result(sanitizerSubscribe(&this->handle, cb, this)); + + check_result(sanitizerEnableDomain(1, this->handle, SANITIZER_CB_DOMAIN_RUNTIME_API)); +} -DEFINE_OVERLOAD(cudaLaunchCooperativeKernel, - ARG(void const* func, - dim3 gridDim, - dim3 blockDim, - void** args, - size_t sharedMem, - cudaStream_t stream), - ARG(func, gridDim, blockDim, args, sharedMem, stream)); -DEFINE_OVERLOAD(cudaLaunchHostFunc, - ARG(cudaStream_t stream, cudaHostFn_t fn, void* userData), - ARG(stream, fn, userData)); - -// Memory transfer APIS: -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY.html#group__CUDART__MEMORY 
+sanitizer_subscriber::~sanitizer_subscriber() { check_result(sanitizerUnsubscribe(this->handle)); } + +void sanitizer_subscriber::check_result(SanitizerResult result) +{ + if (result != SANITIZER_SUCCESS) { + const char* str; + sanitizerGetResultString(result, &str); + throw std::runtime_error(std::string("Sanitizer error: ") + str); + } +} + +template +void sanitizer_subscriber::check_stream_arg(const Sanitizer_CallbackData* cbdata) +{ + const auto* args = static_cast(cbdata->functionParams); + check_stream_and_error(args->*Field); +} + +// `generated_cuda_runtime_api_meta.h` is provided by the CUDA Toolkit/Compute Sanitizer. +// It defines versioned callback parameter structs named like +// `cudaMemcpyAsync_v3020_params`, where the numeric suffix identifies the CUDA runtime +// API version associated with that parameter layout. +#define CHECK_STREAM_ARG(call, version, field) \ + case SANITIZER_CBID_RUNTIME_API_##call: { \ + using args_t = call##_v##version##_params; \ + check_stream_arg(runtime_cbdata); \ + } break + +void sanitizer_subscriber::callback(Sanitizer_CallbackDomain domain, + Sanitizer_CallbackId cbid, + const void* cbdata) +{ + switch (domain) { + case SANITIZER_CB_DOMAIN_RUNTIME_API: { + const auto* runtime_cbdata = static_cast(cbdata); + + if (runtime_cbdata->callbackSite == SANITIZER_API_ENTER) { + switch (cbid) { + CHECK_STREAM_ARG(cudaEventRecord, 3020, stream); + CHECK_STREAM_ARG(cudaEventRecord_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaEventRecordWithFlags, 11010, stream); + CHECK_STREAM_ARG(cudaEventRecordWithFlags_ptsz, 11010, stream); + CHECK_STREAM_ARG(cudaLaunchKernel, 7000, stream); + CHECK_STREAM_ARG(cudaLaunchKernel_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaLaunchCooperativeKernel, 9000, stream); + CHECK_STREAM_ARG(cudaLaunchCooperativeKernel_ptsz, 9000, stream); + CHECK_STREAM_ARG(cudaLaunchHostFunc, 10000, stream); + CHECK_STREAM_ARG(cudaLaunchHostFunc_ptsz, 10000, stream); #if CUDART_VERSION >= 13000 -DEFINE_OVERLOAD( - 
cudaMemPrefetchAsync, - ARG(void const* devPtr, size_t count, cudaMemLocation loc, int flags, cudaStream_t stream), - ARG(devPtr, count, loc, flags, stream)); + CHECK_STREAM_ARG(cudaMemPrefetchAsync, 12020, stream); + CHECK_STREAM_ARG(cudaMemPrefetchAsync_ptsz, 12020, stream); #else -DEFINE_OVERLOAD(cudaMemPrefetchAsync, - ARG(void const* devPtr, size_t count, int dstDevice, cudaStream_t stream), - ARG(devPtr, count, dstDevice, stream)); + CHECK_STREAM_ARG(cudaMemPrefetchAsync, 8000, stream); + CHECK_STREAM_ARG(cudaMemPrefetchAsync_ptsz, 8000, stream); + CHECK_STREAM_ARG(cudaMemPrefetchAsync_v2, 12020, stream); + CHECK_STREAM_ARG(cudaMemPrefetchAsync_v2_ptsz, 12020, stream); #endif -DEFINE_OVERLOAD(cudaMemcpy2DAsync, - ARG(void* dst, - size_t dpitch, - void const* src, - size_t spitch, - size_t width, - size_t height, - cudaMemcpyKind kind, - cudaStream_t stream), - ARG(dst, dpitch, src, spitch, width, height, kind, stream)); -DEFINE_OVERLOAD(cudaMemcpy2DFromArrayAsync, - ARG(void* dst, - size_t dpitch, - cudaArray_const_t src, - size_t wOffset, - size_t hOffset, - size_t width, - size_t height, - cudaMemcpyKind kind, - cudaStream_t stream), - ARG(dst, dpitch, src, wOffset, hOffset, width, height, kind, stream)); -DEFINE_OVERLOAD(cudaMemcpy2DToArrayAsync, - ARG(cudaArray_t dst, - size_t wOffset, - size_t hOffset, - void const* src, - size_t spitch, - size_t width, - size_t height, - cudaMemcpyKind kind, - cudaStream_t stream), - ARG(dst, wOffset, hOffset, src, spitch, width, height, kind, stream)); -DEFINE_OVERLOAD(cudaMemcpy3DAsync, - ARG(cudaMemcpy3DParms const* p, cudaStream_t stream), - ARG(p, stream)); -DEFINE_OVERLOAD(cudaMemcpy3DPeerAsync, - ARG(cudaMemcpy3DPeerParms const* p, cudaStream_t stream), - ARG(p, stream)); -DEFINE_OVERLOAD( - cudaMemcpyAsync, - ARG(void* dst, void const* src, size_t count, cudaMemcpyKind kind, cudaStream_t stream), - ARG(dst, src, count, kind, stream)); -DEFINE_OVERLOAD(cudaMemcpyFromSymbolAsync, - ARG(void* dst, - void const* 
symbol, - size_t count, - size_t offset, - cudaMemcpyKind kind, - cudaStream_t stream), - ARG(dst, symbol, count, offset, kind, stream)); -DEFINE_OVERLOAD(cudaMemcpyToSymbolAsync, - ARG(void const* symbol, - void const* src, - size_t count, - size_t offset, - cudaMemcpyKind kind, - cudaStream_t stream), - ARG(symbol, src, count, offset, kind, stream)); -DEFINE_OVERLOAD( - cudaMemset2DAsync, - ARG(void* devPtr, size_t pitch, int value, size_t width, size_t height, cudaStream_t stream), - ARG(devPtr, pitch, value, width, height, stream)); -DEFINE_OVERLOAD( - cudaMemset3DAsync, - ARG(cudaPitchedPtr pitchedDevPtr, int value, cudaExtent extent, cudaStream_t stream), - ARG(pitchedDevPtr, value, extent, stream)); -DEFINE_OVERLOAD(cudaMemsetAsync, - ARG(void* devPtr, int value, size_t count, cudaStream_t stream), - ARG(devPtr, value, count, stream)); - -// Memory allocation APIS: -// https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__MEMORY__POOLS.html#group__CUDART__MEMORY__POOLS -DEFINE_OVERLOAD(cudaFreeAsync, ARG(void* devPtr, cudaStream_t stream), ARG(devPtr, stream)); -DEFINE_OVERLOAD(cudaMallocAsync, - ARG(void** devPtr, size_t size, cudaStream_t stream), - ARG(devPtr, size, stream)); -DEFINE_OVERLOAD(cudaMallocFromPoolAsync, - ARG(void** ptr, size_t size, cudaMemPool_t memPool, cudaStream_t stream), - ARG(ptr, size, memPool, stream)); - -/** - * @brief Function to collect all the original CUDA symbols corresponding to overloaded functions. - * - * Note on priorities: - * - `originals` must be initialized first, so it is 1001. - * - The function names must be added to originals next in the macro, so those are 1002. - * - Finally, this function actually finds the original symbols so it is 1003. 
- */ -__attribute__((constructor(1003))) void find_originals() -{ - for (auto it : originals) { - originals[it.first] = dlsym(RTLD_NEXT, it.first.data()); + CHECK_STREAM_ARG(cudaMemcpy2DAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemcpy2DAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemcpy2DFromArrayAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemcpy2DFromArrayAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemcpy2DToArrayAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemcpy2DToArrayAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemcpy3DAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemcpy3DAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemcpy3DPeerAsync, 4000, stream); + CHECK_STREAM_ARG(cudaMemcpy3DPeerAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemcpyAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemcpyAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemcpyFromSymbolAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemcpyFromSymbolAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemcpyToSymbolAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemcpyToSymbolAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemset2DAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemset2DAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemset3DAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemset3DAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaMemsetAsync, 3020, stream); + CHECK_STREAM_ARG(cudaMemsetAsync_ptsz, 7000, stream); + CHECK_STREAM_ARG(cudaFreeAsync, 11020, hStream); + CHECK_STREAM_ARG(cudaFreeAsync_ptsz, 11020, hStream); + CHECK_STREAM_ARG(cudaMallocAsync, 11020, hStream); + CHECK_STREAM_ARG(cudaMallocAsync_ptsz, 11020, hStream); + CHECK_STREAM_ARG(cudaMallocFromPoolAsync, 11020, stream); + CHECK_STREAM_ARG(cudaMallocFromPoolAsync_ptsz, 11020, stream); + } + } + } break; + default: break; } } + +#undef CHECK_STREAM_ARG + +sanitizer_subscriber subscriber; diff --git a/dependencies.yaml b/dependencies.yaml index 77b4b456b08..cae4816eec5 100644 --- a/dependencies.yaml +++ 
b/dependencies.yaml @@ -147,13 +147,6 @@ files: - develop - iwyu - py_version - stream_tests: - output: none - includes: - - build_all - - build_base - - cuda - - cuda_version docs: output: none includes: @@ -657,6 +650,7 @@ dependencies: - cuda-cudart-dev - cuda-nvrtc-dev - cuda-nvtx-dev + - cuda-sanitizer-api - libcufile-dev - libcurand-dev - libnvjitlink-dev diff --git a/python/pylibcudf/tests/conftest.py b/python/pylibcudf/tests/conftest.py index 9b5638a4621..dc78c748e63 100644 --- a/python/pylibcudf/tests/conftest.py +++ b/python/pylibcudf/tests/conftest.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # Tell ruff it's OK that some imports occur after the sys.path.insert # ruff: noqa: E402 @@ -363,3 +363,16 @@ def has_nulls(request): ) def has_nans(request): return request.param + + +@pytest.fixture(scope="session") +def patch_cupy_stream(request): + import cupy as cp + + # TODO: Remove this version conditional once we require CuPy 14 + if hasattr(cp.cuda.Stream, "from_external"): + return cp.cuda.Stream.from_external(plc.utils.CUDF_DEFAULT_STREAM) + else: + version, stream_ptr = plc.utils.CUDF_DEFAULT_STREAM.__cuda_stream__() + assert version == 0 + return cp.cuda.ExternalStream(stream_ptr) diff --git a/python/pylibcudf/tests/test_column_from_array.py b/python/pylibcudf/tests/test_column_from_array.py index 31cf54864b3..39a4fff011f 100644 --- a/python/pylibcudf/tests/test_column_from_array.py +++ b/python/pylibcudf/tests/test_column_from_array.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 import pyarrow as pa @@ -50,7 +50,7 @@ def shape(request): @pytest.fixture(params=CUPY_DTYPES, ids=repr) -def cp_array(request, shape): +def cp_array(request, shape, patch_cupy_stream): dtype = request.param size = np.prod(shape) if dtype == np.bool_: @@ -59,7 +59,8 @@ def cp_array(request, shape): ).reshape(shape) else: np_arr = np.arange(size, dtype=dtype).reshape(shape) - return cp.asarray(np_arr), np_arr + with patch_cupy_stream: + return cp.asarray(np_arr), np_arr @pytest.fixture(params=NUMPY_DTYPES, ids=repr) @@ -104,16 +105,18 @@ def test_from_numpy_array(np_array): assert_column_eq(expected, got) -def test_non_c_contiguous_raises(cp_array): +def test_non_c_contiguous_raises(cp_array, patch_cupy_stream): cp_arr = cp_array[0] if len(cp_arr.shape) == 1: return + with patch_cupy_stream: + fortran_arr = cp.asfortranarray(cp_arr) with pytest.raises( ValueError, match="Data must be C-contiguous", ): - plc.Column.from_array(cp.asfortranarray(cp_arr)) + plc.Column.from_array(fortran_arr) def test_row_limit_exceed_raises(): @@ -134,11 +137,12 @@ def __init__(self, shape): plc.Column.from_array(Foo((SIZE_TYPE_LIMIT, 1))) -def test_flat_size_exceeds_size_type_limit(): +def test_flat_size_exceeds_size_type_limit(patch_cupy_stream): nrows = 2**16 ncols = (SIZE_TYPE_LIMIT // nrows) + 1 - arr = cp.zeros((nrows, ncols), dtype=np.int32) + with patch_cupy_stream: + arr = cp.zeros((nrows, ncols), dtype=np.int32) with pytest.raises( ValueError, @@ -191,8 +195,9 @@ def test_from_zero_dimensional_array(): ([[], []], np.int32, pa.array([[], []], type=pa.list_(pa.int32()))), ], ) -def test_empty_array(np_or_cp_array, arr, dtype, expect): - arr = np_or_cp_array(arr, dtype=dtype) +def test_empty_array(np_or_cp_array, arr, dtype, expect, patch_cupy_stream): + with patch_cupy_stream: + arr = np_or_cp_array(arr, dtype=dtype) col = plc.Column.from_array(arr) assert_column_eq(expect, col) diff --git a/python/pylibcudf/tests/test_interop.py 
b/python/pylibcudf/tests/test_interop.py index a21ed0277b0..b7ce9547f8d 100644 --- a/python/pylibcudf/tests/test_interop.py +++ b/python/pylibcudf/tests/test_interop.py @@ -143,8 +143,9 @@ def test_round_trip_dlpack_plc_table(): @pytest.mark.parametrize("array", [np.array, cp.array]) -def test_round_trip_dlpack_array(array): - arr = array([1, 2, 3]) +def test_round_trip_dlpack_array(array, patch_cupy_stream): + with patch_cupy_stream: + arr = array([1, 2, 3]) result = plc.interop.from_dlpack(arr.__dlpack__()) expected = pa.table({"a": [1, 2, 3]}) assert_table_eq(expected, result) diff --git a/python/pylibcudf/tests/test_reshape.py b/python/pylibcudf/tests/test_reshape.py index 1fb406d9719..e0c50542de7 100644 --- a/python/pylibcudf/tests/test_reshape.py +++ b/python/pylibcudf/tests/test_reshape.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 import cupy as cp @@ -50,7 +50,7 @@ def test_tile(reshape_data, cnt): ("float64", TypeId.FLOAT64), ], ) -def test_table_to_array(dtype, type_id): +def test_table_to_array(dtype, type_id, patch_cupy_stream): arrow_type = pa.from_numpy_dtype(getattr(cp, dtype)) arrs = [ pa.array([1, 2, 3], type=arrow_type), @@ -68,5 +68,6 @@ def test_table_to_array(dtype, type_id): got.nbytes, ) - expect = cp.array([[1, 4], [2, 5], [3, 6]], dtype=dtype) - cp.testing.assert_array_equal(expect, got) + with patch_cupy_stream: + expect = cp.array([[1, 4], [2, 5], [3, 6]], dtype=dtype) + cp.testing.assert_array_equal(expect, got) From 3700502e078c204124dda09ee7904f4ddb8f676d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 May 2026 12:00:17 -0500 Subject: [PATCH 19/36] Run all nvbench benchmarks with timeout in smoketest (#20538) Updates the benchmark smoketest script to run all nvbench benchmarks. This should catch any invalid benchmarks and issues like segfaults that occur at runtime. 
We set a 1-minute timeout in case of hangs, since these are smoke tests. The `--profile` flag ensures only a quick one-off run is used. Authors: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - David Wendt (https://github.com/davidwendt) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/20538 --- .github/workflows/test.yaml | 12 ++++++++++++ ci/run_cudf_benchmark_smoketests.sh | 24 +++++++++++++++++++++--- ci/test_cpp.sh | 7 ------- ci/test_cpp_benchmarks.sh | 13 +++++++++++++ 4 files changed, 46 insertions(+), 10 deletions(-) create mode 100755 ci/test_cpp_benchmarks.sh diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 95439e65744..b4977f60def 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -39,6 +39,18 @@ jobs: date: ${{ inputs.date }} script: ci/test_cpp.sh sha: ${{ inputs.sha }} + conda-cpp-benchmark-tests: + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main + with: + build_type: ${{ inputs.build_type }} + branch: ${{ inputs.branch }} + date: ${{ inputs.date }} + sha: ${{ inputs.sha }} + node_type: "gpu-l4-latest-1" + arch: "amd64" + container_image: "rapidsai/ci-conda:26.06-latest" + script: ci/test_cpp_benchmarks.sh conda-cpp-memcheck-tests: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@main diff --git a/ci/run_cudf_benchmark_smoketests.sh b/ci/run_cudf_benchmark_smoketests.sh index 8069e60bd82..c9e9873a6f3 100755 --- a/ci/run_cudf_benchmark_smoketests.sh +++ b/ci/run_cudf_benchmark_smoketests.sh @@ -21,6 +21,24 @@ else exit 1 fi -# Ensure that benchmarks are runnable -# Run a small nvbench benchmark -./STRINGS_NVBENCH --profile --benchmark 0 --devices 0 +EXITCODE=0 +# Run all nvbench benchmarks with --profile and rmm_mode=cuda +for bench in *_NVBENCH; do + if [[ -x "$bench" && -f "$bench" ]]; 
then + start_time=$(date +%s) + echo "Running $bench with --profile..." + "./$bench" --profile --devices 0 -q --rmm_mode cuda + SUITEERROR=$? + end_time=$(date +%s) + duration=$((end_time - start_time)) + if (( SUITEERROR == 0 )); then + echo "Benchmark $bench passed in $duration seconds" + else + echo "Benchmark $bench failed in $duration seconds: $SUITEERROR" + EXITCODE=$SUITEERROR + fi + fi +done + +echo "Test script exiting with value: $EXITCODE" +exit ${EXITCODE} diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh index f548cc0a9e8..4ae71ff3081 100755 --- a/ci/test_cpp.sh +++ b/ci/test_cpp.sh @@ -32,12 +32,5 @@ if (( SUITEERROR == 0 )); then SUITEERROR=$? fi -# Ensure that benchmarks are runnable -if (( SUITEERROR == 0 )); then - rapids-logger "Run tests of libcudf benchmarks" - timeout 30m ./ci/run_cudf_benchmark_smoketests.sh - SUITEERROR=$? -fi - rapids-logger "Test script exiting with value: $EXITCODE" exit ${EXITCODE} diff --git a/ci/test_cpp_benchmarks.sh b/ci/test_cpp_benchmarks.sh new file mode 100755 index 00000000000..a2f34c047d8 --- /dev/null +++ b/ci/test_cpp_benchmarks.sh @@ -0,0 +1,13 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +set -euo pipefail + +# Support invoking test_cpp_benchmarks.sh outside the script directory +cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ + +source ./ci/test_cpp_common.sh + +rapids-logger "Run tests of libcudf benchmarks" +./ci/run_cudf_benchmark_smoketests.sh From f0b2a424bb6e55609de4645b3ce9a3a89eb78eda Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 6 May 2026 10:35:34 -0700 Subject: [PATCH 20/36] Fix a crash in the ORC reader with malformed stripe footers (#22383) `read_orc` segfaults on malformed ORC files whose stripe footer's `ColumnEncoding` array has fewer entries than the file footer's type count. 
This PR adds a check of `stripe_footer.columns.size()` against `ff.types.size()` to avoid OOB access to the stripe footer. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Muhammad Haseeb (https://github.com/mhaseeb123) - Bradley Dice (https://github.com/bdice) - Tianyu Liu (https://github.com/kingcrimsontianyu) URL: https://github.com/rapidsai/cudf/pull/22383 --- cpp/src/io/orc/aggregate_orc_metadata.cpp | 5 +++++ .../tests/data/orc/stripe_footer_no_encodings.orc | Bin 0 -> 597 bytes python/cudf/cudf/tests/input_output/test_orc.py | 10 ++++++++++ 3 files changed, 15 insertions(+) create mode 100644 python/cudf/cudf/tests/data/orc/stripe_footer_no_encodings.orc diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp index 8da5c355190..a4522461777 100644 --- a/cpp/src/io/orc/aggregate_orc_metadata.cpp +++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp @@ -267,6 +267,11 @@ aggregate_orc_metadata::select_stripes( {buffer->data(), buffer->size()}); protobuf_reader(sf_data.data(), sf_data.size()) .read(per_file_metadata[mapping.source_idx].stripefooters[i]); + auto const& stripe_footer = per_file_metadata[mapping.source_idx].stripefooters[i]; + auto const num_types = per_file_metadata[mapping.source_idx].ff.types.size(); + CUDF_EXPECTS(stripe_footer.columns.size() >= num_types, + "Invalid ColumnEncoding field in a stripe footer.", + std::out_of_range); mapping.stripe_info[i].stripe_footer = &per_file_metadata[mapping.source_idx].stripefooters[i]; if (stripe->indexLength == 0) { row_grp_idx_present = false; } diff --git a/python/cudf/cudf/tests/data/orc/stripe_footer_no_encodings.orc b/python/cudf/cudf/tests/data/orc/stripe_footer_no_encodings.orc new file mode 100644 index 0000000000000000000000000000000000000000..70ecbdc70529b6ea665e13d6accb40100ba1c516 GIT binary patch literal 597 zcmZ9KK}*9h7>1K}o2B$nib&aMyZiaBS|s^IGNt6C0)5zJGjSmS8LFe@c$R$|7|Cz<|8 
zSRSQRx+i1L_eMyb+oJ3?y|Fh&wGXG_n=Zh#^?#hbU0!sJ*NzR42CU(MzxVQ~>+|Jo F{~OQGpZ)*< literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/input_output/test_orc.py b/python/cudf/cudf/tests/input_output/test_orc.py index 42af9926f7f..b8f6ee0395a 100644 --- a/python/cudf/cudf/tests/input_output/test_orc.py +++ b/python/cudf/cudf/tests/input_output/test_orc.py @@ -584,6 +584,16 @@ def test_orc_read_incorrect_ps_length(): cudf.read_orc(buf) +def test_orc_read_stripe_footer_no_encodings(datadir): + # Crafted ORC whose stripe footer's ColumnEncoding list is empty even though + # the file footer declares one data column. The reader used to index the + # encoding list out of bounds and segfault; it now raises IndexError from + # the early stripe-footer validation in aggregate_orc_metadata. + path = datadir / "stripe_footer_no_encodings.orc" + with pytest.raises(IndexError): + cudf.read_orc(path) + + def test_orc_reader_tzif_timestamps(datadir): # Contains timstamps in the range covered by the TZif file # Other timedate tests only cover "future" times From df9ea2470dd5cf28ec017c978877024a781620e1 Mon Sep 17 00:00:00 2001 From: Yunsong Wang <12716979+PointKernel@users.noreply.github.com> Date: Wed, 6 May 2026 12:11:52 -0700 Subject: [PATCH 21/36] Rename build/probe to right/left in hash_join and distinct_hash_join (#22382) There have been prior discussions about unifying the join interfaces to avoid, or at least reduce, the mixed use of probe/build and left/right terminology, which can be confusing. This is the first PR in a series that renames join operations to replace the probe/build terminology with left/right. The probe/build roles are not deterministic and can vary depending on the algorithm, whereas left/right provides a consistent and unambiguous reference, minimizing confusion. This PR applies the renaming to `hash_join` and `distinct_hash_join`. There are no functional changes, only naming updates. 
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Lawrence Mitchell (https://github.com/wence-) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/22382 --- cpp/benchmarks/join/distinct_join.cu | 16 +- .../cudf/detail/join/distinct_hash_join.cuh | 36 ++-- cpp/include/cudf/detail/join/hash_join.hpp | 50 ++--- cpp/include/cudf/join/distinct_hash_join.hpp | 40 ++-- cpp/include/cudf/join/hash_join.hpp | 142 +++++++------- cpp/src/join/distinct_hash_join.cu | 158 ++++++++-------- cpp/src/join/hash_join/common.cuh | 8 +- cpp/src/join/hash_join/dispatch.cuh | 28 +-- .../hash_join/full_join_match_context.cpp | 7 +- cpp/src/join/hash_join/full_join_retrieve.cu | 6 +- cpp/src/join/hash_join/full_join_size.cu | 6 +- cpp/src/join/hash_join/full_join_size_impl.cu | 28 +-- cpp/src/join/hash_join/hash_join.cu | 108 +++++------ .../hash_join/inner_join_match_context.cpp | 7 +- cpp/src/join/hash_join/inner_join_retrieve.cu | 6 +- cpp/src/join/hash_join/inner_join_size.cu | 6 +- .../hash_join/left_join_match_context.cpp | 7 +- cpp/src/join/hash_join/left_join_retrieve.cu | 6 +- cpp/src/join/hash_join/left_join_size.cu | 6 +- cpp/src/join/hash_join/match_context.cu | 33 ++-- cpp/src/join/hash_join/retrieve_impl.cuh | 74 ++++---- cpp/src/join/hash_join/size_impl.cuh | 74 ++++---- cpp/tests/join/distinct_join_tests.cpp | 176 +++++++++--------- cpp/tests/join/join_tests.cpp | 24 +-- 24 files changed, 524 insertions(+), 528 deletions(-) diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index 6fe8928128c..3f656e3423c 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -18,11 +18,11 @@ void nvbench_distinct_inner_join(nvbench::state& state, { auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); - auto join = [](cudf::table_view const& probe_input, - cudf::table_view 
const& build_input, + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, cudf::null_equality compare_nulls) { - auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls, LOAD_FACTOR}; - return hj_obj.inner_join(probe_input); + auto hj_obj = cudf::distinct_hash_join{right_input, compare_nulls, LOAD_FACTOR}; + return hj_obj.inner_join(left_input); }; BM_join(state, dtypes, join); @@ -36,11 +36,11 @@ void nvbench_distinct_left_join(nvbench::state& state, { auto dtypes = cycle_dtypes(get_type_or_group(static_cast(DataType)), num_keys); - auto join = [](cudf::table_view const& probe_input, - cudf::table_view const& build_input, + auto join = [](cudf::table_view const& left_input, + cudf::table_view const& right_input, cudf::null_equality compare_nulls) { - auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls, LOAD_FACTOR}; - return hj_obj.left_join(probe_input); + auto hj_obj = cudf::distinct_hash_join{right_input, compare_nulls, LOAD_FACTOR}; + return hj_obj.left_join(left_input); }; BM_join(state, dtypes, join); diff --git a/cpp/include/cudf/detail/join/distinct_hash_join.cuh b/cpp/include/cudf/detail/join/distinct_hash_join.cuh index ecd2324d95c..3d0b1852e89 100644 --- a/cpp/include/cudf/detail/join/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/join/distinct_hash_join.cuh @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ #pragma once @@ -25,14 +25,14 @@ using cudf::detail::row::lhs_index_type; using cudf::detail::row::rhs_index_type; /** - * @brief A custom comparator used for the build table insertion + * @brief A custom comparator used for the right table insertion */ struct always_not_equal { __device__ constexpr bool operator()( cuco::pair const&, cuco::pair const&) const noexcept { - // All build table keys are distinct thus `false` no matter what + // All right table keys are distinct thus `false` no matter what return false; } }; @@ -76,11 +76,11 @@ struct primitive_comparator_adapter { }; /** - * @brief Distinct hash join that builds hash table in creation and probes results in subsequent - * `*_join` member functions. + * @brief Distinct hash join that builds a hash table with the right table on construction and + * probes results in subsequent `*_join` member functions. * - * This class enables the distinct hash join scheme that builds hash table once, and probes as many - * times as needed (possibly in parallel). + * This class enables the distinct hash join scheme that builds with the right table once and + * probes with many left tables (possibly in parallel). */ class distinct_hash_join { public: @@ -104,15 +104,15 @@ class distinct_hash_join { }; /** - * @brief Constructor that internally builds the hash table based on the given `build` table. + * @brief Constructor that internally builds the hash table from the given `right` table. * - * @throw cudf::logic_error if the number of columns in `build` table is 0. + * @throw cudf::logic_error if the number of columns in `right` table is 0. * - * @param build The build table, from which the hash table is built + * @param right The right table, from which the hash table is built * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ - distinct_hash_join(cudf::table_view const& build, + distinct_hash_join(cudf::table_view const& right, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream); @@ -121,7 +121,7 @@ class distinct_hash_join { * * @param load_factor The hash table occupancy ratio in (0,1]. A value of 0.5 means 50% occupancy. */ - distinct_hash_join(cudf::table_view const& build, + distinct_hash_join(cudf::table_view const& right, cudf::null_equality compare_nulls, double load_factor, rmm::cuda_stream_view stream); @@ -131,7 +131,7 @@ class distinct_hash_join { */ std::pair>, std::unique_ptr>> - inner_join(cudf::table_view const& probe, + inner_join(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -139,7 +139,7 @@ class distinct_hash_join { * @copydoc cudf::distinct_hash_join::left_join */ std::unique_ptr> left_join( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -156,11 +156,11 @@ class distinct_hash_join { rmm::mr::polymorphic_allocator, cuco_storage_type>; - bool _has_nested_columns; ///< True if nested columns are present in build and probe tables + bool _has_nested_columns; ///< True if nested columns are present in right and left tables cudf::null_equality _nulls_equal; ///< Whether to consider nulls as equal - cudf::table_view _build; ///< Input table to build the hash map + cudf::table_view _right; ///< Input table to build the hash map std::shared_ptr - _preprocessed_build; ///< Input table preprocssed for row operators - hash_table_type _hash_table; ///< Hash table built on `_build` + _preprocessed_right; ///< Input table preprocssed for row operators + hash_table_type _hash_table; ///< Hash table built on `_right` }; } // namespace cudf::detail diff --git a/cpp/include/cudf/detail/join/hash_join.hpp b/cpp/include/cudf/detail/join/hash_join.hpp index df3d0b6c2fe..b1b96ca7218 100644 --- 
a/cpp/include/cudf/detail/join/hash_join.hpp +++ b/cpp/include/cudf/detail/join/hash_join.hpp @@ -29,8 +29,8 @@ class preprocessed_table; namespace cudf { namespace detail { /** - * @brief Hash join that builds hash table in creation and probes results in subsequent `*_join` - * member functions. + * @brief Hash join that builds a hash table with the right table on construction and probes + * results in subsequent `*_join` member functions. * * User-defined hash function can be passed via the template parameter `Hasher` * @@ -50,17 +50,17 @@ class hash_join { hash_join& operator=(hash_join&&) = delete; /** - * @brief Constructor that internally builds the hash table based on the given `build` table. + * @brief Constructor that internally builds the hash table from the given `right` table. * - * @throw cudf::logic_error if the number of columns in `build` table is 0. + * @throw cudf::logic_error if the number of columns in `right` table is 0. * - * @param build The build table, from which the hash table is built. - * @param has_nulls Flag to indicate if the there exists any nulls in the `build` table or - * any `probe` table that will be used later for join. + * @param right The right table, from which the hash table is built. + * @param has_nulls Flag to indicate if the there exists any nulls in the `right` table or + * any `left` table that will be used later for join. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. */ - hash_join(cudf::table_view const& build, + hash_join(cudf::table_view const& right, bool has_nulls, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream); @@ -70,7 +70,7 @@ class hash_join { * * @param load_factor The hash table occupancy ratio in (0,1]. A value of 0.5 means 50% occupancy. 
*/ - hash_join(cudf::table_view const& build, + hash_join(cudf::table_view const& right, bool has_nulls, cudf::null_equality compare_nulls, double load_factor, @@ -81,7 +81,7 @@ class hash_join { */ [[nodiscard]] std::pair>, std::unique_ptr>> - inner_join(cudf::table_view const& probe, + inner_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -91,7 +91,7 @@ class hash_join { */ [[nodiscard]] std::pair>, std::unique_ptr>> - left_join(cudf::table_view const& probe, + left_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -101,7 +101,7 @@ class hash_join { */ [[nodiscard]] std::pair>, std::unique_ptr>> - full_join(cudf::table_view const& probe, + full_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -109,19 +109,19 @@ class hash_join { /** * @copydoc cudf::hash_join::inner_join_size */ - [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& probe, + [[nodiscard]] std::size_t inner_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream) const; /** * @copydoc cudf::hash_join::left_join_size */ - [[nodiscard]] std::size_t left_join_size(cudf::table_view const& probe, + [[nodiscard]] std::size_t left_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream) const; /** * @copydoc cudf::hash_join::full_join_size */ - [[nodiscard]] std::size_t full_join_size(cudf::table_view const& probe, + [[nodiscard]] std::size_t full_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -129,7 +129,7 @@ class hash_join { * @copydoc cudf::hash_join::inner_join_match_context */ [[nodiscard]] cudf::join_match_context inner_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, 
rmm::device_async_resource_ref mr) const; @@ -137,7 +137,7 @@ class hash_join { * @copydoc cudf::hash_join::left_join_match_context */ [[nodiscard]] cudf::join_match_context left_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; @@ -145,39 +145,39 @@ class hash_join { * @copydoc cudf::hash_join::full_join_match_context */ [[nodiscard]] cudf::join_match_context full_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; private: bool const _is_empty; ///< true if `_hash_table` is empty - bool const _has_nulls; ///< true if nulls are present in either build table or any probe table + bool const _has_nulls; ///< true if nulls are present in either right table or any left table cudf::null_equality const _nulls_equal; ///< whether to consider nulls as equal - cudf::table_view _build; ///< input table to build the hash map + cudf::table_view _right; ///< input table to build the hash map std::shared_ptr - _preprocessed_build; ///< input table preprocssed for row operators + _preprocessed_right; ///< input table preprocssed for row operators std::unique_ptr _impl; ///< CUDA hash table implementation [[nodiscard]] std::unique_ptr> make_match_counts( join_kind join, - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; template [[nodiscard]] std::pair>, std::unique_ptr>> - join_retrieve(cudf::table_view const& probe, + join_retrieve(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; template - [[nodiscard]] std::size_t join_size(cudf::table_view const& probe, + [[nodiscard]] std::size_t join_size(cudf::table_view const& left, rmm::cuda_stream_view stream) const; template - [[nodiscard]] std::size_t 
join_size(cudf::table_view const& probe, + [[nodiscard]] std::size_t join_size(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; }; diff --git a/cpp/include/cudf/join/distinct_hash_join.hpp b/cpp/include/cudf/join/distinct_hash_join.hpp index da1338f4351..4d1b686978a 100644 --- a/cpp/include/cudf/join/distinct_hash_join.hpp +++ b/cpp/include/cudf/join/distinct_hash_join.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -33,13 +33,13 @@ class distinct_hash_join; } // namespace detail /** - * @brief Distinct hash join that builds hash table in creation and probes results in subsequent - * `*_join` member functions + * @brief Distinct hash join that builds a hash table with the right table on construction and + * probes results in subsequent `*_join` member functions * - * This class enables the distinct hash join scheme that builds hash table once, and probes as many - * times as needed (possibly in parallel). + * This class enables the distinct hash join scheme that builds with the right table once and + * probes with many left tables (possibly in parallel). * - * @note Behavior is undefined if the build table contains duplicates. + * @note Behavior is undefined if the right table contains duplicates. 
* @note All NaNs are considered as equal */ class distinct_hash_join { @@ -54,17 +54,17 @@ class distinct_hash_join { /** * @brief Constructs a distinct hash join object for subsequent probe calls * - * @throw cudf::logic_error if the build table has no columns + * @throw cudf::logic_error if the right table has no columns * @throw std::invalid_argument if load_factor is not greater than 0 and less than or equal to 1 * - * @param build The build table that contains distinct elements + * @param right The right table that contains distinct elements * @param compare_nulls Controls whether null join-key values should match or not * @param load_factor The desired ratio of filled slots to total slots in the hash table, must be * in range (0,1]. For example, 0.5 indicates a target of 50% occupancy. Note that the actual * occupancy achieved may be slightly lower than the specified value. * @param stream CUDA stream used for device memory operations and kernel launches */ - distinct_hash_join(cudf::table_view const& build, + distinct_hash_join(cudf::table_view const& right, null_equality compare_nulls = null_equality::EQUAL, double load_factor = 0.5, rmm::cuda_stream_view stream = cudf::get_default_stream()); @@ -73,39 +73,39 @@ class distinct_hash_join { * @brief Returns the row indices that can be used to construct the result of performing * an inner join between two tables. @see cudf::inner_join(). * - * @param probe The probe table, from which the keys are probed + * @param left The left table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned indices' device memory. 
* - * @return A pair of columns [`probe_indices`, `build_indices`] that can be used to + * @return A pair of columns [`left_indices`, `right_indices`] that can be used to * construct the result of performing an inner join between two tables - * with `build` and `probe` as the join keys. + * with `left` and `right` as the join keys. */ [[nodiscard]] std::pair>, std::unique_ptr>> - inner_join(cudf::table_view const& probe, + inner_join(cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** - * @brief Returns the build table indices that can be used to construct the result of performing + * @brief Returns the right table indices that can be used to construct the result of performing * a left join between two tables. * - * @note For a given row index `i` of the probe table, the resulting `build_indices[i]` contains - * the row index of the matched row from the build table if there is a match. Otherwise, contains + * @note For a given row index `i` of the left table, the resulting `right_indices[i]` contains + * the row index of the matched row from the right table if there is a match. Otherwise, contains * `JoinNoMatch`. * - * @param probe The probe table, from which the keys are probed + * @param left The left table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. * - * @return A `build_indices` column that can be used to construct the result of - * performing a left join between two tables with `build` and `probe` as the join + * @return A `right_indices` column that can be used to construct the result of + * performing a left join between two tables with `left` and `right` as the join * keys. 
*/ [[nodiscard]] std::unique_ptr> left_join( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; diff --git a/cpp/include/cudf/join/hash_join.hpp b/cpp/include/cudf/join/hash_join.hpp index ace38dcb3f3..0865fb784cb 100644 --- a/cpp/include/cudf/join/hash_join.hpp +++ b/cpp/include/cudf/join/hash_join.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -45,8 +45,8 @@ class hash_join; } // namespace detail /** - * @brief The enum class to specify if any of the input join tables (`build` table and any later - * `probe` table) has nulls. + * @brief The enum class to specify if any of the input join tables (`right` table and any later + * `left` table) has nulls. * * This is used upon hash_join object construction to specify the existence of nulls in all the * possible input tables. If such null existence is unknown, `YES` should be used as the default @@ -55,11 +55,11 @@ class hash_join; enum class nullable_join : bool { YES, NO }; /** - * @brief Hash join that builds hash table in creation and probes results in subsequent `*_join` - * member functions. + * @brief Hash join that builds a hash table with the right table on construction and probes + * results in subsequent `*_join` member functions. * - * This class enables the hash join scheme that builds hash table once, and probes as many times as - * needed (possibly in parallel). + * This class enables the hash join scheme that builds with the right table once and probes + * with many left tables (possibly in parallel). */ class hash_join { public: @@ -76,16 +76,16 @@ class hash_join { /** * @brief Construct a hash join object for subsequent probe calls. 
* - * @note The `hash_join` object must not outlive the table viewed by `build`, else behavior is + * @note The `hash_join` object must not outlive the table viewed by `right`, else behavior is * undefined. * - * @throws std::invalid_argument if the build table has no columns + * @throws std::invalid_argument if the right table has no columns * - * @param build The build table, from which the hash table is built + * @param right The right table, from which the hash table is built * @param compare_nulls Controls whether null join-key values should match or not * @param stream CUDA stream used for device memory operations and kernel launches */ - hash_join(cudf::table_view const& build, + hash_join(cudf::table_view const& right, null_equality compare_nulls, rmm::cuda_stream_view stream = cudf::get_default_stream()); @@ -94,12 +94,12 @@ class hash_join { * * @throws std::invalid_argument if load_factor is not greater than 0 and less than or equal to 1 * - * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or - * any `probe` table that will be used later for join + * @param has_nulls Flag to indicate if there exists any nulls in the `right` table or + * any `left` table that will be used later for join * @param load_factor The hash table occupancy ratio in (0,1]. A value of 0.5 means 50% desired * occupancy. */ - hash_join(cudf::table_view const& build, + hash_join(cudf::table_view const& right, nullable_join has_nulls, null_equality compare_nulls, double load_factor, @@ -110,22 +110,22 @@ class hash_join { * an inner join between two tables. @see cudf::inner_join(). Behavior is undefined if the * provided `output_size` is smaller than the actual output size. * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. 
* - * @param probe The probe table, from which the tuples are probed + * @param left The left table, from which the tuples are probed * @param output_size Optional value which allows users to specify the exact output size * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct - * the result of performing an inner join between two tables with `build` and `probe` + * the result of performing an inner join between two tables with `left` and `right` * as the join keys . */ [[nodiscard]] std::pair>, std::unique_ptr>> - inner_join(cudf::table_view const& probe, + inner_join(cudf::table_view const& left, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; @@ -135,22 +135,22 @@ class hash_join { * a left join between two tables. @see cudf::left_join(). Behavior is undefined if the * provided `output_size` is smaller than the actual output size. * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. * - * @param probe The probe table, from which the tuples are probed + * @param left The left table, from which the tuples are probed * @param output_size Optional value which allows users to specify the exact output size * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. 
* * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct - * the result of performing a left join between two tables with `build` and `probe` + * the result of performing a left join between two tables with `left` and `right` * as the join keys. */ [[nodiscard]] std::pair>, std::unique_ptr>> - left_join(cudf::table_view const& probe, + left_join(cudf::table_view const& left, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; @@ -160,151 +160,151 @@ class hash_join { * a full join between two tables. @see cudf::full_join(). Behavior is undefined if the * provided `output_size` is smaller than the actual output size. * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. * - * @param probe The probe table, from which the tuples are probed + * @param left The left table, from which the tuples are probed * @param output_size Optional value which allows users to specify the exact output size * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. * * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct - * the result of performing a full join between two tables with `build` and `probe` + * the result of performing a full join between two tables with `left` and `right` * as the join keys . 
*/ [[nodiscard]] std::pair>, std::unique_ptr>> - full_join(cudf::table_view const& probe, + full_join(cudf::table_view const& left, std::optional output_size = {}, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** * Returns the exact number of matches (rows) when performing an inner join with the specified - * probe table. + * left table. * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. * - * @param probe The probe table, from which the tuples are probed + * @param left The left table, from which the tuples are probed * @param stream CUDA stream used for device memory operations and kernel launches * * @return The exact number of output when performing an inner join between two tables with - * `build` and `probe` as the join keys . + * `left` and `right` as the join keys . */ [[nodiscard]] std::size_t inner_join_size( - cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; + cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** - * Returns the exact number of matches (rows) when performing a left join with the specified probe + * Returns the exact number of matches (rows) when performing a left join with the specified left * table. * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. 
* - * @param probe The probe table, from which the tuples are probed + * @param left The left table, from which the tuples are probed * @param stream CUDA stream used for device memory operations and kernel launches * - * @return The exact number of output when performing a left join between two tables with `build` - * and `probe` as the join keys . + * @return The exact number of output when performing a left join between two tables with `left` + * and `right` as the join keys . */ [[nodiscard]] std::size_t left_join_size( - cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; + cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** - * Returns the exact number of matches (rows) when performing a full join with the specified probe + * Returns the exact number of matches (rows) when performing a full join with the specified left * table. * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. * - * @param probe The probe table, from which the tuples are probed + * @param left The left table, from which the tuples are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the intermediate table and columns' device * memory. * - * @return The exact number of output when performing a full join between two tables with `build` - * and `probe` as the join keys . + * @return The exact number of output when performing a full join between two tables with `left` + * and `right` as the join keys . 
*/ [[nodiscard]] std::size_t full_join_size( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** - * @brief Returns context information about matches between the probe and build tables. + * @brief Returns context information about matches between the left and right tables. * - * This method computes, for each row in the probe table, how many matching rows exist in - * the build table according to inner join semantics, and returns the number of matches through a + * This method computes, for each row in the left table, how many matching rows exist in + * the right table according to inner join semantics, and returns the number of matches through a * join_match_context object. * * This is particularly useful for: * - Determining the total size of a potential join result without materializing it * - Planning partitioned join operations for large datasets * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. 
* - * @param probe The probe table to join with the pre-processed build table + * @param left The left table to join with the pre-processed right table * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the result device memory * - * @return A join_match_context object containing the probe table view and a device vector - * of match counts for each row in the probe table + * @return A join_match_context object containing the left table view and a device vector + * of match counts for each row in the left table */ [[nodiscard]] cudf::join_match_context inner_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** - * @brief Returns context information about matches between the probe and build tables. + * @brief Returns context information about matches between the left and right tables. * - * This method computes, for each row in the probe table, how many matching rows exist in - * the build table according to left join semantics, and returns the number of matches through a + * This method computes, for each row in the left table, how many matching rows exist in + * the right table according to left join semantics, and returns the number of matches through a * join_match_context object. * - * For left join, every row in the probe table will have at least one match (either with a - * matching row from the build table or with a null placeholder). + * For left join, every row in the left table will have at least one match (either with a + * matching row from the right table or with a null placeholder). 
* - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. * - * @param probe The probe table to join with the pre-processed build table + * @param left The left table to join with the pre-processed right table * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the result device memory * - * @return A join_match_context object containing the probe table view and a device vector - * of match counts for each row in the probe table + * @return A join_match_context object containing the left table view and a device vector + * of match counts for each row in the left table */ [[nodiscard]] cudf::join_match_context left_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** - * @brief Returns context information about matches between the probe and build tables. + * @brief Returns context information about matches between the left and right tables. * - * This method computes, for each row in the probe table, how many matching rows exist in - * the build table according to full join semantics, and returns the number of matches through a + * This method computes, for each row in the left table, how many matching rows exist in + * the right table according to full join semantics, and returns the number of matches through a * join_match_context object. * - * For full join, this includes matches for probe table rows, and the result may need to be - * combined with unmatched rows from the build table to get the complete picture. 
+ * For full join, this includes matches for left table rows, and the result may need to be + * combined with unmatched rows from the right table to get the complete picture. * - * @throw std::invalid_argument If the input probe table has nulls while this hash_join object was + * @throw std::invalid_argument If the input left table has nulls while this hash_join object was * not constructed with null check. * - * @param probe The probe table to join with the pre-processed build table + * @param left The left table to join with the pre-processed right table * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the result device memory * - * @return A join_match_context object containing the probe table view and a device vector - * of match counts for each row in the probe table + * @return A join_match_context object containing the left table view and a device vector + * of match counts for each row in the left table */ [[nodiscard]] cudf::join_match_context full_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index 36b057d33a4..718f96bf29d 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -103,7 +103,7 @@ struct output_fn { * @param hash_table The hash table to search in * @param iter Iterator over hash values * @param d_equal Equality comparator - * @param probe The probe table + * @param left The left table * @param hasher Hash function * @param nulls_equal Null equality setting * @param found_begin Output iterator for found indices @@ -117,27 +117,27 @@ template {0}; auto const row_bitmask = - cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first; + 
cudf::detail::bitmask_and(left, stream, cudf::get_current_device_resource_ref()).first; auto const pred = cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; hash_table.find_if_async(iter, - iter + probe_table_num_rows, + iter + left_table_num_rows, stencil, pred, d_equal, @@ -149,22 +149,22 @@ void find_matches_in_hash_table(HashTableType const& hash_table, } // namespace -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, +distinct_hash_join::distinct_hash_join(cudf::table_view const& right, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) - : distinct_hash_join{build, compare_nulls, CUCO_DESIRED_LOAD_FACTOR, stream} + : distinct_hash_join{right, compare_nulls, CUCO_DESIRED_LOAD_FACTOR, stream} { } -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, +distinct_hash_join::distinct_hash_join(cudf::table_view const& right, cudf::null_equality compare_nulls, double load_factor, rmm::cuda_stream_view stream) - : _has_nested_columns{cudf::has_nested_columns(build)}, + : _has_nested_columns{cudf::has_nested_columns(right)}, _nulls_equal{compare_nulls}, - _build{build}, - _preprocessed_build{cudf::detail::row::equality::preprocessed_table::create(_build, stream)}, - _hash_table{cuco::extent{static_cast(build.num_rows())}, + _right{right}, + _preprocessed_right{cudf::detail::row::equality::preprocessed_table::create(_right, stream)}, + _hash_table{cuco::extent{static_cast(right.num_rows())}, load_factor, cuco::empty_key{cuco::pair{std::numeric_limits::max(), rhs_index_type{cudf::JoinNoMatch}}}, @@ -176,41 +176,41 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, stream.value()} { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(0 != this->_build.num_columns(), "Hash join build table is empty"); + CUDF_EXPECTS(0 != this->_right.num_columns(), "Hash join right table is empty"); CUDF_EXPECTS(load_factor > 0 && load_factor <= 1, "Invalid load factor: must be greater than 0 and less than or equal to 
1.", std::invalid_argument); - size_type const build_table_num_rows{_build.num_rows()}; + size_type const right_table_num_rows{_right.num_rows()}; - if (build_table_num_rows == 0) { return; } + if (right_table_num_rows == 0) { return; } auto const build_hash_table = [&](auto iter) { - if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(build))) { - this->_hash_table.insert_async(iter, iter + build_table_num_rows, stream.value()); + if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(right))) { + this->_hash_table.insert_async(iter, iter + right_table_num_rows, stream.value()); } else { auto stencil = cuda::counting_iterator{0}; auto const row_bitmask = - cudf::detail::bitmask_and(_build, stream, cudf::get_current_device_resource_ref()).first; + cudf::detail::bitmask_and(_right, stream, cudf::get_current_device_resource_ref()).first; auto const pred = cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; // insert valid rows this->_hash_table.insert_if_async( - iter, iter + build_table_num_rows, stencil, pred, stream.value()); + iter, iter + right_table_num_rows, stencil, pred, stream.value()); } }; - if (cudf::detail::is_primitive_row_op_compatible(_build)) { + if (cudf::detail::is_primitive_row_op_compatible(_right)) { auto const d_hasher = cudf::detail::row::primitive::row_hasher{nullate::DYNAMIC{has_nulls}, - this->_preprocessed_build}; + this->_preprocessed_right}; auto const iter = cudf::detail::make_counting_transform_iterator( 0, primitive_keys_fn{d_hasher}); build_hash_table(iter); } else { - auto const row_hasher = detail::row::hash::row_hasher{this->_preprocessed_build}; + auto const row_hasher = detail::row::hash::row_hasher{this->_preprocessed_right}; auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); auto const iter = @@ -222,54 +222,54 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, std::pair>, std::unique_ptr>> 
-distinct_hash_join::inner_join(cudf::table_view const& probe, +distinct_hash_join::inner_join(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::inner_join"}; - size_type const probe_table_num_rows{probe.num_rows()}; + size_type const left_table_num_rows{left.num_rows()}; // If output size is zero, return immediately - if (probe_table_num_rows == 0) { + if (left_table_num_rows == 0) { return std::pair(std::make_unique>(0, stream, mr), std::make_unique>(0, stream, mr)); } - auto build_indices = - std::make_unique>(probe_table_num_rows, stream, mr); - auto probe_indices = - std::make_unique>(probe_table_num_rows, stream, mr); + auto right_indices = + std::make_unique>(left_table_num_rows, stream, mr); + auto left_indices = + std::make_unique>(left_table_num_rows, stream, mr); - auto found_indices = rmm::device_uvector(probe_table_num_rows, stream); + auto found_indices = rmm::device_uvector(left_table_num_rows, stream); auto const found_begin = thrust::make_transform_output_iterator(found_indices.begin(), output_fn{}); - auto preprocessed_probe = cudf::detail::row::equality::preprocessed_table::create(probe, stream); - if (cudf::detail::is_primitive_row_op_compatible(_build)) { + auto preprocessed_left = cudf::detail::row::equality::preprocessed_table::create(left, stream); + if (cudf::detail::is_primitive_row_op_compatible(_right)) { auto const d_hasher = - cudf::detail::row::primitive::row_hasher{nullate::DYNAMIC{has_nulls}, preprocessed_probe}; + cudf::detail::row::primitive::row_hasher{nullate::DYNAMIC{has_nulls}, preprocessed_left}; auto const d_equal = cudf::detail::row::primitive::row_equality_comparator{ - nullate::DYNAMIC{has_nulls}, preprocessed_probe, _preprocessed_build, _nulls_equal}; + nullate::DYNAMIC{has_nulls}, preprocessed_left, _preprocessed_right, _nulls_equal}; auto const iter = cudf::detail::make_counting_transform_iterator( 0, 
primitive_keys_fn{d_hasher}); find_matches_in_hash_table(this->_hash_table, iter, primitive_comparator_adapter{d_equal}, - probe, + left, hasher{}, _nulls_equal, found_begin, stream); } else { auto const two_table_equal = - cudf::detail::row::equality::two_table_comparator(preprocessed_probe, _preprocessed_build); + cudf::detail::row::equality::two_table_comparator(preprocessed_left, _preprocessed_right); - auto const probe_row_hasher = cudf::detail::row::hash::row_hasher{preprocessed_probe}; - auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto const left_row_hasher = cudf::detail::row::hash::row_hasher{preprocessed_left}; + auto const d_left_hasher = left_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_left_hasher}); if (_has_nested_columns) { auto const device_comparator = @@ -277,7 +277,7 @@ distinct_hash_join::inner_join(cudf::table_view const& probe, find_matches_in_hash_table(this->_hash_table, iter, comparator_adapter{device_comparator}, - probe, + left, hasher{}, _nulls_equal, found_begin, @@ -288,7 +288,7 @@ distinct_hash_join::inner_join(cudf::table_view const& probe, find_matches_in_hash_table(this->_hash_table, iter, comparator_adapter{device_comparator}, - probe, + left, hasher{}, _nulls_equal, found_begin, @@ -303,10 +303,10 @@ distinct_hash_join::inner_join(cudf::table_view const& probe, return cuda::std::tuple{*(found_iter + idx), idx}; })); auto const output_begin = - thrust::make_zip_iterator(build_indices->begin(), probe_indices->begin()); + thrust::make_zip_iterator(right_indices->begin(), left_indices->begin()); auto const output_end = cudf::detail::copy_if(tuple_iter, - tuple_iter + probe_table_num_rows, + tuple_iter + left_table_num_rows, found_indices.begin(), output_begin, 
cuda::proclaim_return_type( @@ -314,38 +314,38 @@ distinct_hash_join::inner_join(cudf::table_view const& probe, stream); auto const actual_size = std::distance(output_begin, output_end); - build_indices->resize(actual_size, stream); - probe_indices->resize(actual_size, stream); + right_indices->resize(actual_size, stream); + left_indices->resize(actual_size, stream); - return {std::move(probe_indices), std::move(build_indices)}; + return {std::move(left_indices), std::move(right_indices)}; } std::unique_ptr> distinct_hash_join::left_join( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::left_join"}; - size_type const probe_table_num_rows{probe.num_rows()}; + size_type const left_table_num_rows{left.num_rows()}; // If output size is zero, return empty - if (probe_table_num_rows == 0) { + if (left_table_num_rows == 0) { return std::make_unique>(0, stream, mr); } - auto build_indices = - std::make_unique>(probe_table_num_rows, stream, mr); + auto right_indices = + std::make_unique>(left_table_num_rows, stream, mr); auto const output_begin = - thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); + thrust::make_transform_output_iterator(right_indices->begin(), output_fn{}); - auto preprocessed_probe = cudf::detail::row::equality::preprocessed_table::create(probe, stream); + auto preprocessed_left = cudf::detail::row::equality::preprocessed_table::create(left, stream); - if (cudf::detail::is_primitive_row_op_compatible(_build)) { + if (cudf::detail::is_primitive_row_op_compatible(_right)) { auto const d_hasher = - cudf::detail::row::primitive::row_hasher{nullate::DYNAMIC{has_nulls}, preprocessed_probe}; + cudf::detail::row::primitive::row_hasher{nullate::DYNAMIC{has_nulls}, preprocessed_left}; auto const d_equal = cudf::detail::row::primitive::row_equality_comparator{ - nullate::DYNAMIC{has_nulls}, 
preprocessed_probe, _preprocessed_build, _nulls_equal}; + nullate::DYNAMIC{has_nulls}, preprocessed_left, _preprocessed_right, _nulls_equal}; auto const iter = cudf::detail::make_counting_transform_iterator( 0, primitive_keys_fn{d_hasher}); @@ -353,26 +353,26 @@ std::unique_ptr> distinct_hash_join::left_join( find_matches_in_hash_table(this->_hash_table, iter, primitive_comparator_adapter{d_equal}, - probe, + left, hasher{}, _nulls_equal, output_begin, stream); } else { - // If build table is empty, return probe table - if (this->_build.num_rows() == 0) { + // If right table is empty, return left table + if (this->_right.num_rows() == 0) { thrust::fill(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), - build_indices->begin(), - build_indices->end(), + right_indices->begin(), + right_indices->end(), cudf::JoinNoMatch); } else { auto const two_table_equal = - cudf::detail::row::equality::two_table_comparator(preprocessed_probe, _preprocessed_build); + cudf::detail::row::equality::two_table_comparator(preprocessed_left, _preprocessed_right); - auto const probe_row_hasher = cudf::detail::row::hash::row_hasher{preprocessed_probe}; - auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto const left_row_hasher = cudf::detail::row::hash::row_hasher{preprocessed_left}; + auto const d_left_hasher = left_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_left_hasher}); if (_has_nested_columns) { auto const device_comparator = @@ -380,7 +380,7 @@ std::unique_ptr> distinct_hash_join::left_join( find_matches_in_hash_table(this->_hash_table, iter, comparator_adapter{device_comparator}, - probe, + left, hasher{}, _nulls_equal, output_begin, @@ -391,7 +391,7 @@ std::unique_ptr> distinct_hash_join::left_join( 
find_matches_in_hash_table(this->_hash_table, iter, comparator_adapter{device_comparator}, - probe, + left, hasher{}, _nulls_equal, output_begin, @@ -399,34 +399,34 @@ std::unique_ptr> distinct_hash_join::left_join( } } } - return build_indices; + return right_indices; } } // namespace detail distinct_hash_join::~distinct_hash_join() = default; -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, +distinct_hash_join::distinct_hash_join(cudf::table_view const& right, null_equality compare_nulls, double load_factor, rmm::cuda_stream_view stream) - : _impl{std::make_unique(build, compare_nulls, load_factor, stream)} + : _impl{std::make_unique(right, compare_nulls, load_factor, stream)} { } std::pair>, std::unique_ptr>> -distinct_hash_join::inner_join(cudf::table_view const& probe, +distinct_hash_join::inner_join(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->inner_join(probe, stream, mr); + return _impl->inner_join(left, stream, mr); } std::unique_ptr> distinct_hash_join::left_join( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->left_join(probe, stream, mr); + return _impl->left_join(left, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join/common.cuh b/cpp/src/join/hash_join/common.cuh index f523564f20f..d80f981099e 100644 --- a/cpp/src/join/hash_join/common.cuh +++ b/cpp/src/join/hash_join/common.cuh @@ -24,17 +24,17 @@ using hash_table_t = typename cudf::detail::hash_join::imp bool is_trivial_join(table_view const& left, table_view const& right, join_kind join_type); -void validate_hash_join_probe(table_view const& build, table_view const& probe, bool has_nulls); +void validate_hash_join_probe(table_view const& right, table_view const& left, bool has_nulls); std::unique_ptr> make_join_match_counts( - table_view const& build, - std::shared_ptr const& 
preprocessed_build, + table_view const& right, + std::shared_ptr const& preprocessed_right, cudf::detail::hash_table_t const& hash_table, bool is_empty, bool has_nulls, null_equality compare_nulls, join_kind join, - table_view const& probe, + table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); diff --git a/cpp/src/join/hash_join/dispatch.cuh b/cpp/src/join/hash_join/dispatch.cuh index 43f3c6edf3e..2092817e293 100644 --- a/cpp/src/join/hash_join/dispatch.cuh +++ b/cpp/src/join/hash_join/dispatch.cuh @@ -42,7 +42,7 @@ class pair_equal { }; /** - * @brief Extracts the build-side row index from a cuco hash table slot. + * @brief Extracts the right-side row index from a cuco hash table slot. */ struct output_fn { __device__ constexpr cudf::size_type operator()( @@ -75,34 +75,34 @@ class primitive_pair_equal { template decltype(auto) dispatch_join_comparator( - table_view const& build_table, - table_view const& probe_table, - std::shared_ptr const& preprocessed_build, - std::shared_ptr const& preprocessed_probe, + table_view const& right_table, + table_view const& left_table, + std::shared_ptr const& preprocessed_right, + std::shared_ptr const& preprocessed_left, bool has_nulls, null_equality compare_nulls, Fn&& fn) { - auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls}; + auto const left_nulls = cudf::nullate::DYNAMIC{has_nulls}; - if (cudf::detail::is_primitive_row_op_compatible(build_table)) { - auto const d_hasher = cudf::detail::row::primitive::row_hasher{probe_nulls, preprocessed_probe}; + if (cudf::detail::is_primitive_row_op_compatible(right_table)) { + auto const d_hasher = cudf::detail::row::primitive::row_hasher{left_nulls, preprocessed_left}; auto const d_equal = cudf::detail::row::primitive::row_equality_comparator{ - probe_nulls, preprocessed_probe, preprocessed_build, compare_nulls}; + left_nulls, preprocessed_left, preprocessed_right, compare_nulls}; return std::forward(fn)(primitive_pair_equal{d_equal}, 
d_hasher); } auto const d_hasher = - cudf::detail::row::hash::row_hasher{preprocessed_probe}.device_hasher(probe_nulls); + cudf::detail::row::hash::row_hasher{preprocessed_left}.device_hasher(left_nulls); auto const row_comparator = - cudf::detail::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build}; + cudf::detail::row::equality::two_table_comparator{preprocessed_left, preprocessed_right}; - if (cudf::detail::has_nested_columns(probe_table)) { - auto const d_equal = row_comparator.equal_to(probe_nulls, compare_nulls); + if (cudf::detail::has_nested_columns(left_table)) { + auto const d_equal = row_comparator.equal_to(left_nulls, compare_nulls); return std::forward(fn)(pair_equal{d_equal}, d_hasher); } - auto const d_equal = row_comparator.equal_to(probe_nulls, compare_nulls); + auto const d_equal = row_comparator.equal_to(left_nulls, compare_nulls); return std::forward(fn)(pair_equal{d_equal}, d_hasher); } diff --git a/cpp/src/join/hash_join/full_join_match_context.cpp b/cpp/src/join/hash_join/full_join_match_context.cpp index b2a6f063f22..f065064e7f6 100644 --- a/cpp/src/join/hash_join/full_join_match_context.cpp +++ b/cpp/src/join/hash_join/full_join_match_context.cpp @@ -12,17 +12,16 @@ namespace cudf::detail { template cudf::join_match_context hash_join::full_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"hash_join::full_join_match_context"}; - return cudf::join_match_context{probe, - make_match_counts(join_kind::FULL_JOIN, probe, stream, mr)}; + return cudf::join_match_context{left, make_match_counts(join_kind::FULL_JOIN, left, stream, mr)}; } template cudf::join_match_context cudf::hash_join::impl_type::full_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git 
a/cpp/src/join/hash_join/full_join_retrieve.cu b/cpp/src/join/hash_join/full_join_retrieve.cu index cf0b33e6ef7..7830b6aea6d 100644 --- a/cpp/src/join/hash_join/full_join_retrieve.cu +++ b/cpp/src/join/hash_join/full_join_retrieve.cu @@ -10,17 +10,17 @@ namespace cudf::detail { template std::pair>, std::unique_ptr>> -hash_join::full_join(cudf::table_view const& probe, +hash_join::full_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return this->template join_retrieve(probe, output_size, stream, mr); + return this->template join_retrieve(left, output_size, stream, mr); } template std::pair>, std::unique_ptr>> -hash_join::full_join(cudf::table_view const& probe, +hash_join::full_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/src/join/hash_join/full_join_size.cu b/cpp/src/join/hash_join/full_join_size.cu index b892289d277..b73f19b4240 100644 --- a/cpp/src/join/hash_join/full_join_size.cu +++ b/cpp/src/join/hash_join/full_join_size.cu @@ -8,15 +8,15 @@ namespace cudf::detail { template -std::size_t hash_join::full_join_size(cudf::table_view const& probe, +std::size_t hash_join::full_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return this->template join_size(probe, stream, mr); + return this->template join_size(left, stream, mr); } template std::size_t hash_join::full_join_size( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/src/join/hash_join/full_join_size_impl.cu b/cpp/src/join/hash_join/full_join_size_impl.cu index 4ce0f2b57aa..308bfa62da4 100644 --- a/cpp/src/join/hash_join/full_join_size_impl.cu +++ b/cpp/src/join/hash_join/full_join_size_impl.cu @@ -53,20 +53,20 @@ std::size_t 
compute_left_join_complement_size(cudf::device_span } // namespace std::size_t get_full_join_size( - cudf::table_view const& build_table, - cudf::table_view const& probe_table, - std::shared_ptr const& preprocessed_build, - std::shared_ptr const& preprocessed_probe, + cudf::table_view const& right_table, + cudf::table_view const& left_table, + std::shared_ptr const& preprocessed_right, + std::shared_ptr const& preprocessed_left, cudf::detail::hash_table_t const& hash_table, bool has_nulls, null_equality compare_nulls, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - std::size_t join_size = compute_join_output_size(build_table, - probe_table, - preprocessed_build, - preprocessed_probe, + std::size_t join_size = compute_join_output_size(right_table, + left_table, + preprocessed_right, + preprocessed_left, hash_table, has_nulls, compare_nulls, @@ -79,18 +79,18 @@ std::size_t get_full_join_size( auto const out_build_begin = thrust::make_transform_output_iterator(right_indices->begin(), output_fn{}); - retrieve_left_join_build_indices(build_table, - probe_table, - preprocessed_build, - preprocessed_probe, + retrieve_left_join_build_indices(right_table, + left_table, + preprocessed_right, + preprocessed_left, hash_table, has_nulls, compare_nulls, out_build_begin, stream); - auto const left_table_row_count = probe_table.num_rows(); - auto const right_table_row_count = build_table.num_rows(); + auto const left_table_row_count = left_table.num_rows(); + auto const right_table_row_count = right_table.num_rows(); return join_size + compute_left_join_complement_size( *right_indices, left_table_row_count, right_table_row_count, stream); diff --git a/cpp/src/join/hash_join/hash_join.cu b/cpp/src/join/hash_join/hash_join.cu index 28448cbb677..5c7679ae588 100644 --- a/cpp/src/join/hash_join/hash_join.cu +++ b/cpp/src/join/hash_join/hash_join.cu @@ -43,81 +43,81 @@ bool is_trivial_join(table_view const& left, table_view const& right, join_kind return false; } 
-void validate_hash_join_probe(table_view const& build, table_view const& probe, bool has_nulls) +void validate_hash_join_probe(table_view const& right, table_view const& left, bool has_nulls) { - CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty", std::invalid_argument); - CUDF_EXPECTS(build.num_columns() == probe.num_columns(), + CUDF_EXPECTS(0 != left.num_columns(), "Hash join left table is empty", std::invalid_argument); + CUDF_EXPECTS(right.num_columns() == left.num_columns(), "Mismatch in number of columns to be joined on", std::invalid_argument); - CUDF_EXPECTS(has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check.", + CUDF_EXPECTS(has_nulls || !cudf::has_nested_nulls(left), + "Left table has nulls while right table was not hashed with null check.", std::invalid_argument); - CUDF_EXPECTS(cudf::have_same_types(build, probe), + CUDF_EXPECTS(cudf::have_same_types(right, left), "Mismatch in joining column data types", cudf::data_type_error); } namespace { void build_hash_join( - cudf::table_view const& build, - std::shared_ptr const& preprocessed_build, + cudf::table_view const& right, + std::shared_ptr const& preprocessed_right, cudf::detail::hash_table_t& hash_table, bool has_nested_nulls, null_equality nulls_equal, [[maybe_unused]] bitmask_type const* bitmask, rmm::cuda_stream_view stream) { - CUDF_EXPECTS(0 != build.num_columns(), "Selected build dataset is empty", std::invalid_argument); - CUDF_EXPECTS(0 != build.num_rows(), "Build side table has no rows", std::invalid_argument); + CUDF_EXPECTS(0 != right.num_columns(), "Selected right dataset is empty", std::invalid_argument); + CUDF_EXPECTS(0 != right.num_rows(), "Right side table has no rows", std::invalid_argument); - auto insert_rows = [&](auto const& build, auto const& d_hasher) { + auto insert_rows = [&](auto const& right, auto const& d_hasher) { auto const iter = cudf::detail::make_counting_transform_iterator(0, 
pair_fn{d_hasher}); - if (nulls_equal == cudf::null_equality::EQUAL or not nullable(build)) { - hash_table.insert(iter, iter + build.num_rows(), stream.value()); + if (nulls_equal == cudf::null_equality::EQUAL or not nullable(right)) { + hash_table.insert(iter, iter + right.num_rows(), stream.value()); } else { auto const stencil = cuda::counting_iterator{0}; auto const pred = row_is_valid{bitmask}; - hash_table.insert_if(iter, iter + build.num_rows(), stencil, pred, stream.value()); + hash_table.insert_if(iter, iter + right.num_rows(), stencil, pred, stream.value()); } }; auto const nulls = nullate::DYNAMIC{has_nested_nulls}; - if (cudf::detail::is_primitive_row_op_compatible(build)) { - auto const d_hasher = cudf::detail::row::primitive::row_hasher{nulls, preprocessed_build}; + if (cudf::detail::is_primitive_row_op_compatible(right)) { + auto const d_hasher = cudf::detail::row::primitive::row_hasher{nulls, preprocessed_right}; - insert_rows(build, d_hasher); + insert_rows(right, d_hasher); } else { - auto const row_hash = detail::row::hash::row_hasher{preprocessed_build}; + auto const row_hash = detail::row::hash::row_hasher{preprocessed_right}; auto const d_hasher = row_hash.device_hasher(nulls); - insert_rows(build, d_hasher); + insert_rows(right, d_hasher); } } } // namespace template -hash_join::hash_join(cudf::table_view const& build, +hash_join::hash_join(cudf::table_view const& right, bool has_nulls, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream) - : hash_join{build, has_nulls, compare_nulls, CUCO_DESIRED_LOAD_FACTOR, stream} + : hash_join{right, has_nulls, compare_nulls, CUCO_DESIRED_LOAD_FACTOR, stream} { } template -hash_join::hash_join(cudf::table_view const& build, +hash_join::hash_join(cudf::table_view const& right, bool has_nulls, cudf::null_equality compare_nulls, double load_factor, rmm::cuda_stream_view stream) : _has_nulls(has_nulls), - _is_empty{build.num_rows() == 0}, + _is_empty{right.num_rows() == 0}, 
_nulls_equal{compare_nulls}, _impl{std::make_unique(impl{typename impl::hash_table_t{ - cuco::extent{static_cast(build.num_rows())}, + cuco::extent{static_cast(right.num_rows())}, load_factor, cuco::empty_key{cuco::pair{std::numeric_limits::max(), cudf::JoinNoMatch}}, {}, @@ -126,11 +126,11 @@ hash_join::hash_join(cudf::table_view const& build, {}, rmm::mr::polymorphic_allocator{}, stream.value()}})}, - _build{build}, - _preprocessed_build{cudf::detail::row::equality::preprocessed_table::create(_build, stream)} + _right{right}, + _preprocessed_right{cudf::detail::row::equality::preprocessed_table::create(_right, stream)} { CUDF_FUNC_RANGE(); - CUDF_EXPECTS(0 != build.num_columns(), "Hash join build table is empty", std::invalid_argument); + CUDF_EXPECTS(0 != right.num_columns(), "Hash join right table is empty", std::invalid_argument); CUDF_EXPECTS(load_factor > 0 && load_factor <= 1, "Invalid load factor: must be greater than 0 and less than or equal to 1.", std::invalid_argument); @@ -138,9 +138,9 @@ hash_join::hash_join(cudf::table_view const& build, if (_is_empty) { return; } auto const row_bitmask = - cudf::detail::bitmask_and(build, stream, cudf::get_current_device_resource_ref()).first; - cudf::detail::build_hash_join(_build, - _preprocessed_build, + cudf::detail::bitmask_and(right, stream, cudf::get_current_device_resource_ref()).first; + cudf::detail::build_hash_join(_right, + _preprocessed_right, _impl->_hash_table, _has_nulls, _nulls_equal, @@ -148,12 +148,12 @@ hash_join::hash_join(cudf::table_view const& build, stream); } -template hash_join::hash_join(cudf::table_view const& build, +template hash_join::hash_join(cudf::table_view const& right, bool has_nulls, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream); -template hash_join::hash_join(cudf::table_view const& build, +template hash_join::hash_join(cudf::table_view const& right, bool has_nulls, cudf::null_equality compare_nulls, double load_factor, @@ -170,93 +170,93 @@ namespace cudf 
{ hash_join::~hash_join() = default; -hash_join::hash_join(cudf::table_view const& build, +hash_join::hash_join(cudf::table_view const& right, null_equality compare_nulls, rmm::cuda_stream_view stream) : hash_join( - build, nullable_join::YES, compare_nulls, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, stream) + right, nullable_join::YES, compare_nulls, cudf::detail::CUCO_DESIRED_LOAD_FACTOR, stream) { } -hash_join::hash_join(cudf::table_view const& build, +hash_join::hash_join(cudf::table_view const& right, nullable_join has_nulls, null_equality compare_nulls, double load_factor, rmm::cuda_stream_view stream) : _impl{std::make_unique( - build, has_nulls == nullable_join::YES, compare_nulls, load_factor, stream)} + right, has_nulls == nullable_join::YES, compare_nulls, load_factor, stream)} { } std::pair>, std::unique_ptr>> -hash_join::inner_join(cudf::table_view const& probe, +hash_join::inner_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->inner_join(probe, output_size, stream, mr); + return _impl->inner_join(left, output_size, stream, mr); } std::pair>, std::unique_ptr>> -hash_join::left_join(cudf::table_view const& probe, +hash_join::left_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->left_join(probe, output_size, stream, mr); + return _impl->left_join(left, output_size, stream, mr); } std::pair>, std::unique_ptr>> -hash_join::full_join(cudf::table_view const& probe, +hash_join::full_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->full_join(probe, output_size, stream, mr); + return _impl->full_join(left, output_size, stream, mr); } -std::size_t hash_join::inner_join_size(cudf::table_view const& probe, +std::size_t hash_join::inner_join_size(cudf::table_view 
const& left, rmm::cuda_stream_view stream) const { - return _impl->inner_join_size(probe, stream); + return _impl->inner_join_size(left, stream); } -std::size_t hash_join::left_join_size(cudf::table_view const& probe, +std::size_t hash_join::left_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream) const { - return _impl->left_join_size(probe, stream); + return _impl->left_join_size(left, stream); } -std::size_t hash_join::full_join_size(cudf::table_view const& probe, +std::size_t hash_join::full_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->full_join_size(probe, stream, mr); + return _impl->full_join_size(left, stream, mr); } cudf::join_match_context hash_join::inner_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->inner_join_match_context(probe, stream, mr); + return _impl->inner_join_match_context(left, stream, mr); } -cudf::join_match_context hash_join::left_join_match_context(cudf::table_view const& probe, +cudf::join_match_context hash_join::left_join_match_context(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->left_join_match_context(probe, stream, mr); + return _impl->left_join_match_context(left, stream, mr); } -cudf::join_match_context hash_join::full_join_match_context(cudf::table_view const& probe, +cudf::join_match_context hash_join::full_join_match_context(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return _impl->full_join_match_context(probe, stream, mr); + return _impl->full_join_match_context(left, stream, mr); } } // namespace cudf diff --git a/cpp/src/join/hash_join/inner_join_match_context.cpp b/cpp/src/join/hash_join/inner_join_match_context.cpp index 93b2ee7da5a..cf107c7a422 100644 --- 
a/cpp/src/join/hash_join/inner_join_match_context.cpp +++ b/cpp/src/join/hash_join/inner_join_match_context.cpp @@ -12,17 +12,16 @@ namespace cudf::detail { template cudf::join_match_context hash_join::inner_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"hash_join::inner_join_match_context"}; - return cudf::join_match_context{probe, - make_match_counts(join_kind::INNER_JOIN, probe, stream, mr)}; + return cudf::join_match_context{left, make_match_counts(join_kind::INNER_JOIN, left, stream, mr)}; } template cudf::join_match_context cudf::hash_join::impl_type::inner_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/src/join/hash_join/inner_join_retrieve.cu b/cpp/src/join/hash_join/inner_join_retrieve.cu index aad679d216c..57386367a81 100644 --- a/cpp/src/join/hash_join/inner_join_retrieve.cu +++ b/cpp/src/join/hash_join/inner_join_retrieve.cu @@ -10,17 +10,17 @@ namespace cudf::detail { template std::pair>, std::unique_ptr>> -hash_join::inner_join(cudf::table_view const& probe, +hash_join::inner_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return this->template join_retrieve(probe, output_size, stream, mr); + return this->template join_retrieve(left, output_size, stream, mr); } template std::pair>, std::unique_ptr>> -hash_join::inner_join(cudf::table_view const& probe, +hash_join::inner_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/src/join/hash_join/inner_join_size.cu b/cpp/src/join/hash_join/inner_join_size.cu index 3bcd250a80c..2fedd93593e 100644 --- a/cpp/src/join/hash_join/inner_join_size.cu +++ 
b/cpp/src/join/hash_join/inner_join_size.cu @@ -8,13 +8,13 @@ namespace cudf::detail { template -std::size_t hash_join::inner_join_size(cudf::table_view const& probe, +std::size_t hash_join::inner_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream) const { - return this->template join_size(probe, stream); + return this->template join_size(left, stream); } template std::size_t hash_join::inner_join_size( - cudf::table_view const& probe, rmm::cuda_stream_view stream) const; + cudf::table_view const& left, rmm::cuda_stream_view stream) const; } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/left_join_match_context.cpp b/cpp/src/join/hash_join/left_join_match_context.cpp index 86cc963ec3f..59bb6255a79 100644 --- a/cpp/src/join/hash_join/left_join_match_context.cpp +++ b/cpp/src/join/hash_join/left_join_match_context.cpp @@ -12,17 +12,16 @@ namespace cudf::detail { template cudf::join_match_context hash_join::left_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"hash_join::left_join_match_context"}; - return cudf::join_match_context{probe, - make_match_counts(join_kind::LEFT_JOIN, probe, stream, mr)}; + return cudf::join_match_context{left, make_match_counts(join_kind::LEFT_JOIN, left, stream, mr)}; } template cudf::join_match_context cudf::hash_join::impl_type::left_join_match_context( - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/src/join/hash_join/left_join_retrieve.cu b/cpp/src/join/hash_join/left_join_retrieve.cu index 5737703aba8..d84eb05cd4c 100644 --- a/cpp/src/join/hash_join/left_join_retrieve.cu +++ b/cpp/src/join/hash_join/left_join_retrieve.cu @@ -10,17 +10,17 @@ namespace cudf::detail { template std::pair>, std::unique_ptr>> -hash_join::left_join(cudf::table_view const& probe, 
+hash_join::left_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return this->template join_retrieve(probe, output_size, stream, mr); + return this->template join_retrieve(left, output_size, stream, mr); } template std::pair>, std::unique_ptr>> -hash_join::left_join(cudf::table_view const& probe, +hash_join::left_join(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; diff --git a/cpp/src/join/hash_join/left_join_size.cu b/cpp/src/join/hash_join/left_join_size.cu index 24e9c64dc5d..4bc382165d3 100644 --- a/cpp/src/join/hash_join/left_join_size.cu +++ b/cpp/src/join/hash_join/left_join_size.cu @@ -8,13 +8,13 @@ namespace cudf::detail { template -std::size_t hash_join::left_join_size(cudf::table_view const& probe, +std::size_t hash_join::left_join_size(cudf::table_view const& left, rmm::cuda_stream_view stream) const { - return this->template join_size(probe, stream); + return this->template join_size(left, stream); } template std::size_t hash_join::left_join_size( - cudf::table_view const& probe, rmm::cuda_stream_view stream) const; + cudf::table_view const& left, rmm::cuda_stream_view stream) const; } // namespace cudf::detail diff --git a/cpp/src/join/hash_join/match_context.cu b/cpp/src/join/hash_join/match_context.cu index f990b3b5c42..b6653be623c 100644 --- a/cpp/src/join/hash_join/match_context.cu +++ b/cpp/src/join/hash_join/match_context.cu @@ -25,19 +25,18 @@ struct clamp_zero_to_one { } // namespace std::unique_ptr> make_join_match_counts( - table_view const& build, - std::shared_ptr const& preprocessed_build, + table_view const& right, + std::shared_ptr const& preprocessed_right, cudf::detail::hash_table_t const& hash_table, bool is_empty, bool has_nulls, null_equality compare_nulls, join_kind join, - table_view const& probe, + table_view const& left, rmm::cuda_stream_view stream, 
rmm::device_async_resource_ref mr) { - auto match_counts = - std::make_unique>(probe.num_rows(), stream, mr); + auto match_counts = std::make_unique>(left.num_rows(), stream, mr); if (is_empty) { thrust::fill(rmm::exec_policy_nosync(stream, cudf::get_current_device_resource_ref()), @@ -47,19 +46,19 @@ std::unique_ptr> make_join_match_counts( return match_counts; } - CUDF_EXPECTS(has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check.", + CUDF_EXPECTS(has_nulls || !cudf::has_nested_nulls(left), + "Left table has nulls while right table was not hashed with null check.", std::invalid_argument); - auto const preprocessed_probe = - cudf::detail::row::equality::preprocessed_table::create(probe, stream); - auto const probe_table_num_rows = probe.num_rows(); + auto const preprocessed_left = + cudf::detail::row::equality::preprocessed_table::create(left, stream); + auto const left_table_num_rows = left.num_rows(); auto count_matches = [&](auto equality, auto d_hasher) { auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_fn{d_hasher}); if (join == join_kind::INNER_JOIN) { hash_table.count_each(iter, - iter + probe_table_num_rows, + iter + left_table_num_rows, equality, hash_table.hash_function(), match_counts->begin(), @@ -69,7 +68,7 @@ std::unique_ptr> make_join_match_counts( auto const output = thrust::make_transform_output_iterator(match_counts->begin(), clamp_zero_to_one{}); hash_table.count_each(iter, - iter + probe_table_num_rows, + iter + left_table_num_rows, equality, hash_table.hash_function(), output, @@ -78,7 +77,7 @@ std::unique_ptr> make_join_match_counts( }; dispatch_join_comparator( - build, probe, preprocessed_build, preprocessed_probe, has_nulls, compare_nulls, count_matches); + right, left, preprocessed_right, preprocessed_left, has_nulls, compare_nulls, count_matches); return match_counts; } @@ -86,18 +85,18 @@ std::unique_ptr> make_join_match_counts( template 
std::unique_ptr> hash_join::make_match_counts( join_kind join, - cudf::table_view const& probe, + cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { - return make_join_match_counts(_build, - _preprocessed_build, + return make_join_match_counts(_right, + _preprocessed_right, _impl->_hash_table, _is_empty, _has_nulls, _nulls_equal, join, - probe, + left, stream, mr); } diff --git a/cpp/src/join/hash_join/retrieve_impl.cuh b/cpp/src/join/hash_join/retrieve_impl.cuh index d365710a73c..58b29562a2c 100644 --- a/cpp/src/join/hash_join/retrieve_impl.cuh +++ b/cpp/src/join/hash_join/retrieve_impl.cuh @@ -23,10 +23,10 @@ template std::pair>, std::unique_ptr>> probe_join_hash_table( - cudf::table_view const& build_table, - cudf::table_view const& probe_table, - std::shared_ptr const& preprocessed_build, - std::shared_ptr const& preprocessed_probe, + cudf::table_view const& right_table, + cudf::table_view const& left_table, + std::shared_ptr const& preprocessed_right, + std::shared_ptr const& preprocessed_left, cudf::detail::hash_table_t const& hash_table, bool has_nulls, null_equality compare_nulls, @@ -41,10 +41,10 @@ probe_join_hash_table( std::size_t const join_size = output_size ? 
*output_size - : compute_join_output_size(build_table, - probe_table, - preprocessed_build, - preprocessed_probe, + : compute_join_output_size(right_table, + left_table, + preprocessed_right, + preprocessed_left, hash_table, has_nulls, compare_nulls, @@ -60,7 +60,7 @@ probe_join_hash_table( cudf::prefetch::detail::prefetch(*left_indices, stream); cudf::prefetch::detail::prefetch(*right_indices, stream); - auto const probe_table_num_rows = probe_table.num_rows(); + auto const left_table_num_rows = left_table.num_rows(); auto const out_probe_begin = thrust::make_transform_output_iterator(left_indices->begin(), output_fn{}); auto const out_build_begin = @@ -70,7 +70,7 @@ probe_join_hash_table( auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_fn{d_hasher}); if constexpr (Join == join_kind::INNER_JOIN) { hash_table.retrieve(iter, - iter + probe_table_num_rows, + iter + left_table_num_rows, equality, hash_table.hash_function(), out_probe_begin, @@ -79,7 +79,7 @@ probe_join_hash_table( } else { [[maybe_unused]] auto out_probe_end = hash_table .retrieve_outer(iter, - iter + probe_table_num_rows, + iter + left_table_num_rows, equality, hash_table.hash_function(), out_probe_begin, @@ -95,10 +95,10 @@ probe_join_hash_table( } }; - dispatch_join_comparator(build_table, - probe_table, - preprocessed_build, - preprocessed_probe, + dispatch_join_comparator(right_table, + left_table, + preprocessed_right, + preprocessed_left, has_nulls, compare_nulls, retrieve_results); @@ -108,22 +108,22 @@ probe_join_hash_table( template void retrieve_left_join_build_indices( - cudf::table_view const& build_table, - cudf::table_view const& probe_table, - std::shared_ptr const& preprocessed_build, - std::shared_ptr const& preprocessed_probe, + cudf::table_view const& right_table, + cudf::table_view const& left_table, + std::shared_ptr const& preprocessed_right, + std::shared_ptr const& preprocessed_left, cudf::detail::hash_table_t const& hash_table, bool has_nulls, 
null_equality compare_nulls, RightOutputIterator out_build_begin, rmm::cuda_stream_view stream) { - auto const probe_table_num_rows = probe_table.num_rows(); + auto const left_table_num_rows = left_table.num_rows(); auto retrieve_results = [&](auto equality, auto d_hasher) { auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_fn{d_hasher}); hash_table.retrieve_outer(iter, - iter + probe_table_num_rows, + iter + left_table_num_rows, equality, hash_table.hash_function(), cuda::make_discard_iterator(), @@ -131,10 +131,10 @@ void retrieve_left_join_build_indices( stream.value()); }; - dispatch_join_comparator(build_table, - probe_table, - preprocessed_build, - preprocessed_probe, + dispatch_join_comparator(right_table, + left_table, + preprocessed_right, + preprocessed_left, has_nulls, compare_nulls, retrieve_results); @@ -144,36 +144,36 @@ template template std::pair>, std::unique_ptr>> -hash_join::join_retrieve(cudf::table_view const& probe, +hash_join::join_retrieve(cudf::table_view const& left, std::optional output_size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { CUDF_FUNC_RANGE(); - validate_hash_join_probe(_build, probe, _has_nulls); + validate_hash_join_probe(_right, left, _has_nulls); if constexpr (Join == join_kind::INNER_JOIN) { - if (is_trivial_join(probe, _build, Join)) { + if (is_trivial_join(left, _right, Join)) { return std::pair(std::make_unique>(0, stream, mr), std::make_unique>(0, stream, mr)); } } else { - if (_is_empty) { return get_trivial_left_join_indices(probe, stream, mr); } + if (_is_empty) { return get_trivial_left_join_indices(left, stream, mr); } - if (is_trivial_join(probe, _build, Join)) { + if (is_trivial_join(left, _right, Join)) { return std::pair(std::make_unique>(0, stream, mr), std::make_unique>(0, stream, mr)); } } - auto const preprocessed_probe = - cudf::detail::row::equality::preprocessed_table::create(probe, stream); + auto const preprocessed_left = + 
cudf::detail::row::equality::preprocessed_table::create(left, stream); - auto join_indices = cudf::detail::probe_join_hash_table(_build, - probe, - _preprocessed_build, - preprocessed_probe, + auto join_indices = cudf::detail::probe_join_hash_table(_right, + left, + _preprocessed_right, + preprocessed_left, _impl->_hash_table, _has_nulls, _nulls_equal, @@ -183,7 +183,7 @@ hash_join::join_retrieve(cudf::table_view const& probe, if constexpr (Join == join_kind::FULL_JOIN) { auto complement_indices = detail::get_left_join_indices_complement( - join_indices.second, probe.num_rows(), _build.num_rows(), stream, mr); + join_indices.second, left.num_rows(), _right.num_rows(), stream, mr); return detail::concatenate_vector_pairs(join_indices, complement_indices, stream); } else { return join_indices; diff --git a/cpp/src/join/hash_join/size_impl.cuh b/cpp/src/join/hash_join/size_impl.cuh index 6fb2da63d4f..3e20ebc7367 100644 --- a/cpp/src/join/hash_join/size_impl.cuh +++ b/cpp/src/join/hash_join/size_impl.cuh @@ -13,10 +13,10 @@ namespace cudf::detail { std::size_t get_full_join_size( - cudf::table_view const& build_table, - cudf::table_view const& probe_table, - std::shared_ptr const& preprocessed_build, - std::shared_ptr const& preprocessed_probe, + cudf::table_view const& right_table, + cudf::table_view const& left_table, + std::shared_ptr const& preprocessed_right, + std::shared_ptr const& preprocessed_left, cudf::detail::hash_table_t const& hash_table, bool has_nulls, null_equality compare_nulls, @@ -25,10 +25,10 @@ std::size_t get_full_join_size( template std::size_t compute_join_output_size( - table_view const& build_table, - table_view const& probe_table, - std::shared_ptr const& preprocessed_build, - std::shared_ptr const& preprocessed_probe, + table_view const& right_table, + table_view const& left_table, + std::shared_ptr const& preprocessed_right, + std::shared_ptr const& preprocessed_left, cudf::detail::hash_table_t const& hash_table, bool has_nulls, 
cudf::null_equality nulls_equal, @@ -36,34 +36,34 @@ std::size_t compute_join_output_size( { static_assert(Join == join_kind::INNER_JOIN || Join == join_kind::LEFT_JOIN); - if (build_table.num_rows() == 0) { - return Join == join_kind::INNER_JOIN ? 0 : probe_table.num_rows(); + if (right_table.num_rows() == 0) { + return Join == join_kind::INNER_JOIN ? 0 : left_table.num_rows(); } - auto const probe_table_num_rows = probe_table.num_rows(); + auto const left_table_num_rows = left_table.num_rows(); return dispatch_join_comparator( - build_table, - probe_table, - preprocessed_build, - preprocessed_probe, + right_table, + left_table, + preprocessed_right, + preprocessed_left, has_nulls, nulls_equal, [&](auto equality, auto d_hasher) { auto const iter = cudf::detail::make_counting_transform_iterator(0, pair_fn{d_hasher}); if constexpr (Join == join_kind::LEFT_JOIN) { return hash_table.count_outer( - iter, iter + probe_table_num_rows, equality, hash_table.hash_function(), stream.value()); + iter, iter + left_table_num_rows, equality, hash_table.hash_function(), stream.value()); } else { return hash_table.count( - iter, iter + probe_table_num_rows, equality, hash_table.hash_function(), stream.value()); + iter, iter + left_table_num_rows, equality, hash_table.hash_function(), stream.value()); } }); } template template -std::size_t hash_join::join_size(cudf::table_view const& probe, +std::size_t hash_join::join_size(cudf::table_view const& left, rmm::cuda_stream_view stream) const { static_assert(Join == join_kind::INNER_JOIN || Join == join_kind::LEFT_JOIN); @@ -73,20 +73,20 @@ std::size_t hash_join::join_size(cudf::table_view const& probe, if constexpr (Join == join_kind::INNER_JOIN) { if (_is_empty) { return 0; } } else { - if (_is_empty) { return probe.num_rows(); } + if (_is_empty) { return left.num_rows(); } } - CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check.", + 
CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(left), + "Left table has nulls while right table was not hashed with null check.", std::invalid_argument); - auto const preprocessed_probe = - cudf::detail::row::equality::preprocessed_table::create(probe, stream); + auto const preprocessed_left = + cudf::detail::row::equality::preprocessed_table::create(left, stream); - return cudf::detail::compute_join_output_size(_build, - probe, - _preprocessed_build, - preprocessed_probe, + return cudf::detail::compute_join_output_size(_right, + left, + _preprocessed_right, + preprocessed_left, _impl->_hash_table, _has_nulls, _nulls_equal, @@ -95,7 +95,7 @@ std::size_t hash_join::join_size(cudf::table_view const& probe, template template -std::size_t hash_join::join_size(cudf::table_view const& probe, +std::size_t hash_join::join_size(cudf::table_view const& left, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const { @@ -103,19 +103,19 @@ std::size_t hash_join::join_size(cudf::table_view const& probe, CUDF_FUNC_RANGE(); - if (_is_empty) { return probe.num_rows(); } + if (_is_empty) { return left.num_rows(); } - CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(probe), - "Probe table has nulls while build table was not hashed with null check.", + CUDF_EXPECTS(_has_nulls || !cudf::has_nested_nulls(left), + "Left table has nulls while right table was not hashed with null check.", std::invalid_argument); - auto const preprocessed_probe = - cudf::detail::row::equality::preprocessed_table::create(probe, stream); + auto const preprocessed_left = + cudf::detail::row::equality::preprocessed_table::create(left, stream); - return cudf::detail::get_full_join_size(_build, - probe, - _preprocessed_build, - preprocessed_probe, + return cudf::detail::get_full_join_size(_right, + left, + _preprocessed_right, + preprocessed_left, _impl->_hash_table, _has_nulls, _nulls_equal, diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 
e3864f06e97..744caddeb49 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -39,8 +39,8 @@ std::unique_ptr> get_left_indices(cudf::siz struct DistinctJoinTest : public cudf::test::BaseFixture { void compare_to_reference( - cudf::table_view const& build_table, - cudf::table_view const& probe_table, + cudf::table_view const& right_table, + cudf::table_view const& left_table, std::pair>, std::unique_ptr>> const& result, cudf::table_view const& expected_table, @@ -54,8 +54,8 @@ struct DistinctJoinTest : public cudf::test::BaseFixture { auto build_indices_col = cudf::column_view{build_indices_span}; auto probe_indices_col = cudf::column_view{probe_indices_span}; - auto joined_cols = cudf::gather(probe_table, probe_indices_col, oob_policy)->release(); - auto right_cols = cudf::gather(build_table, build_indices_col, oob_policy)->release(); + auto joined_cols = cudf::gather(left_table, probe_indices_col, oob_policy)->release(); + auto right_cols = cudf::gather(right_table, build_indices_col, oob_policy)->release(); joined_cols.insert(joined_cols.end(), std::make_move_iterator(right_cols.begin()), @@ -76,19 +76,19 @@ TEST_F(DistinctJoinTest, IntegerInnerJoin) auto const init = cudf::numeric_scalar{0}; - auto build = cudf::sequence(size, init, cudf::numeric_scalar{1}); - auto probe = cudf::sequence(size, init, cudf::numeric_scalar{2}); + auto right = cudf::sequence(size, init, cudf::numeric_scalar{1}); + auto left = cudf::sequence(size, init, cudf::numeric_scalar{2}); - auto build_table = cudf::table_view{{build->view()}}; - auto probe_table = cudf::table_view{{probe->view()}}; + auto right_table = cudf::table_view{{right->view()}}; + auto left_table = cudf::table_view{{left->view()}}; - auto distinct_join = 
cudf::distinct_hash_join{build_table}; + auto distinct_join = cudf::distinct_hash_join{right_table}; - auto result = distinct_join.inner_join(probe_table); + auto result = distinct_join.inner_join(left_table); auto constexpr gold_size = size / 2; auto gold = cudf::sequence(gold_size, init, cudf::numeric_scalar{2}); - this->compare_to_reference(build_table, probe_table, result, cudf::table_view{{gold->view()}}); + this->compare_to_reference(right_table, left_table, result, cudf::table_view{{gold->view()}}); } TEST_F(DistinctJoinTest, InnerJoinNoNulls) @@ -109,11 +109,11 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls) cols1.push_back(col1_1.release()); cols1.push_back(col1_2.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.inner_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.inner_join(left.view()); column_wrapper col_gold_0{{1, 2}}; strcol_wrapper col_gold_1({"s0", "s0"}); @@ -130,7 +130,7 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls) cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); - this->compare_to_reference(build.view(), probe.view(), result, gold.view()); + this->compare_to_reference(right.view(), left.view(), result, gold.view()); } TEST_F(DistinctJoinTest, PrimitiveInnerJoinNoNulls) @@ -151,11 +151,11 @@ TEST_F(DistinctJoinTest, PrimitiveInnerJoinNoNulls) cols1.push_back(col1_1.release()); cols1.push_back(col1_2.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.inner_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.inner_join(left.view()); 
column_wrapper col_gold_0{{1, 2}}; column_wrapper col_gold_1({0, 0}); @@ -172,7 +172,7 @@ TEST_F(DistinctJoinTest, PrimitiveInnerJoinNoNulls) cols_gold.push_back(col_gold_5.release()); Table gold(std::move(cols_gold)); - this->compare_to_reference(build.view(), probe.view(), result, gold.view()); + this->compare_to_reference(right.view(), left.view(), result, gold.view()); } TEST_F(DistinctJoinTest, InnerJoinWithNulls) @@ -193,8 +193,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls) cols1.push_back(col1_1.release()); cols1.push_back(col1_2.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); // Create gold table once column_wrapper col_gold_0{{3, 2}}; @@ -217,10 +217,10 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls) for (auto load_factor : load_factors) { auto distinct_join = - cudf::distinct_hash_join{build.view(), cudf::null_equality::EQUAL, load_factor}; - auto result = distinct_join.inner_join(probe.view()); + cudf::distinct_hash_join{right.view(), cudf::null_equality::EQUAL, load_factor}; + auto result = distinct_join.inner_join(left.view()); - this->compare_to_reference(build.view(), probe.view(), result, gold.view()); + this->compare_to_reference(right.view(), left.view(), result, gold.view()); } } @@ -242,8 +242,8 @@ TEST_F(DistinctJoinTest, PrimitiveInnerJoinWithNulls) cols1.push_back(col1_1.release()); cols1.push_back(col1_2.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); // Create gold table once column_wrapper col_gold_0{{3, 2}}; @@ -266,10 +266,10 @@ TEST_F(DistinctJoinTest, PrimitiveInnerJoinWithNulls) for (auto load_factor : load_factors) { auto distinct_join = - cudf::distinct_hash_join{build.view(), cudf::null_equality::EQUAL, load_factor}; - auto result = distinct_join.inner_join(probe.view()); + cudf::distinct_hash_join{right.view(), cudf::null_equality::EQUAL, 
load_factor}; + auto result = distinct_join.inner_join(left.view()); - this->compare_to_reference(build.view(), probe.view(), result, gold.view()); + this->compare_to_reference(right.view(), left.view(), result, gold.view()); } } @@ -316,11 +316,11 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls) cols1.push_back(col1_2.release()); cols1.push_back(col1_3.release()); - Table probe(std::move(cols0)); - Table build(std::move(cols1)); + Table left(std::move(cols0)); + Table right(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.inner_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.inner_join(left.view()); column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {true, true}); @@ -354,10 +354,10 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls) cols_gold.push_back(col_gold_7.release()); Table gold(std::move(cols_gold)); - this->compare_to_reference(build.view(), probe.view(), result, gold.view()); + this->compare_to_reference(right.view(), left.view(), result, gold.view()); } -TEST_F(DistinctJoinTest, EmptyBuildTableInnerJoin) +TEST_F(DistinctJoinTest, EmptyRightTableInnerJoin) { column_wrapper col0_0; column_wrapper col0_1; @@ -371,16 +371,16 @@ TEST_F(DistinctJoinTest, EmptyBuildTableInnerJoin) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.inner_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.inner_join(left.view()); - this->compare_to_reference(build.view(), probe.view(), result, build.view()); + this->compare_to_reference(right.view(), left.view(), result, right.view()); } -TEST_F(DistinctJoinTest, 
EmptyBuildTableLeftJoin) +TEST_F(DistinctJoinTest, EmptyRightTableLeftJoin) { column_wrapper col0_0; column_wrapper col0_1; @@ -394,18 +394,18 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.left_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.left_join(left.view()); auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( - build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); + right.view(), left.view(), gather_map, left.view(), cudf::out_of_bounds_policy::NULLIFY); } -TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin) +TEST_F(DistinctJoinTest, EmptyLeftTableInnerJoin) { column_wrapper col0_0{{2, 2, 0, 4, 3}}; column_wrapper col0_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}}; @@ -419,16 +419,16 @@ TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.inner_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.inner_join(left.view()); - this->compare_to_reference(build.view(), probe.view(), result, probe.view()); + this->compare_to_reference(right.view(), left.view(), result, left.view()); } -TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) +TEST_F(DistinctJoinTest, EmptyLeftTableLeftJoin) { column_wrapper col0_0{{2, 2, 0, 4, 3}}; column_wrapper col0_1{{1, 0, 1, 2, 
1}, {true, false, true, true, true}}; @@ -442,15 +442,15 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table build(std::move(cols0)); - Table probe(std::move(cols1)); + Table right(std::move(cols0)); + Table left(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.left_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.left_join(left.view()); auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( - build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); + right.view(), left.view(), gather_map, left.view(), cudf::out_of_bounds_policy::NULLIFY); } TEST_F(DistinctJoinTest, LeftJoinNoNulls) @@ -467,8 +467,8 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table probe(std::move(cols0)); - Table build(std::move(cols1)); + Table left(std::move(cols0)); + Table right(std::move(cols1)); column_wrapper col_gold_0({3, 1, 2, 0, 3}); strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1"}); @@ -481,12 +481,12 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) cols_gold.push_back(col_gold_3.release()); Table gold(std::move(cols_gold)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.left_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.left_join(left.view()); auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( - build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); + right.view(), left.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); } TEST_F(DistinctJoinTest, PrimitiveLeftJoinNoNulls) @@ -503,8 +503,8 @@ 
TEST_F(DistinctJoinTest, PrimitiveLeftJoinNoNulls) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table probe(std::move(cols0)); - Table build(std::move(cols1)); + Table left(std::move(cols0)); + Table right(std::move(cols1)); column_wrapper col_gold_0({3, 1, 2, 0, 3}); column_wrapper col_gold_1({0, 1, 2, 4, 1}); @@ -517,12 +517,12 @@ TEST_F(DistinctJoinTest, PrimitiveLeftJoinNoNulls) cols_gold.push_back(col_gold_3.release()); Table gold(std::move(cols_gold)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.left_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.left_join(left.view()); auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( - build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); + right.view(), left.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); } TEST_F(DistinctJoinTest, LeftJoinWithNulls) @@ -539,11 +539,11 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table probe(std::move(cols0)); - Table build(std::move(cols1)); + Table left(std::move(cols0)); + Table right(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.left_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.left_join(left.view()); auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; @@ -559,7 +559,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) Table gold(std::move(cols_gold)); this->compare_to_reference( - build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); + right.view(), left.view(), gather_map, gold.view(), 
cudf::out_of_bounds_policy::NULLIFY); } TEST_F(DistinctJoinTest, PrimitiveLeftJoinWithNulls) @@ -576,11 +576,11 @@ TEST_F(DistinctJoinTest, PrimitiveLeftJoinWithNulls) cols1.push_back(col1_0.release()); cols1.push_back(col1_1.release()); - Table probe(std::move(cols0)); - Table build(std::move(cols1)); + Table left(std::move(cols0)); + Table right(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.left_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.left_join(left.view()); auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; @@ -596,7 +596,7 @@ TEST_F(DistinctJoinTest, PrimitiveLeftJoinWithNulls) Table gold(std::move(cols_gold)); this->compare_to_reference( - build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); + right.view(), left.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); } TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) @@ -621,11 +621,11 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) cols0.push_back(col0.release()); cols1.push_back(col1.release()); - Table probe(std::move(cols0)); - Table build(std::move(cols1)); + Table left(std::move(cols0)); + Table right(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view()}; - auto result = distinct_join.left_join(probe.view()); + auto distinct_join = cudf::distinct_hash_join{right.view()}; + auto result = distinct_join.left_join(left.view()); auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; auto col0_gold_names_col = strcol_wrapper{ @@ -658,7 +658,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) Table gold(std::move(cols_gold)); this->compare_to_reference( - build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); 
+ right.view(), left.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); } TEST_F(DistinctJoinTest, InvalidLoadFactor) @@ -699,14 +699,14 @@ TEST_F(DistinctJoinTest, DistinctLargeExtentOverflowPrevention) auto const init = cudf::numeric_scalar{0}; auto build_col = cudf::sequence(table_size, init, cudf::numeric_scalar{1}); - auto build_table = cudf::table_view{{build_col->view()}}; + auto right_table = cudf::table_view{{build_col->view()}}; cudf::table empty_probe_table{}; // This should succeed with size_t extent - would have failed with int32_t extent // in scenarios approaching the overflow boundary EXPECT_NO_THROW({ auto distinct_join = cudf::distinct_hash_join( - build_table, cudf::null_equality::EQUAL, load_factor, cudf::get_default_stream()); + right_table, cudf::null_equality::EQUAL, load_factor, cudf::get_default_stream()); auto result = distinct_join.inner_join(empty_probe_table); }); } diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp index 0afa6de2062..d08c7fcf8a6 100644 --- a/cpp/tests/join/join_tests.cpp +++ b/cpp/tests/join/join_tests.cpp @@ -2401,9 +2401,9 @@ TEST_F(JoinTest, HashJoinFullMatchContext) } } -TEST_F(JoinTest, HashJoinMatchContextEmptyBuild) +TEST_F(JoinTest, HashJoinMatchContextEmptyRight) { - // Test match context with empty build table + // Test match context with empty right table column_wrapper col0_0{{3, 1, 2}}; column_wrapper col1_0{}; // Empty @@ -2855,7 +2855,7 @@ struct JoinTestLists : public cudf::test::BaseFixture { [], 3 [5, 6] 4 */ - lcw build{{{0}, {1}, {{2, 0}, null_at(1)}, {}, {5, 6}}, null_at(0)}; + lcw right{{{0}, {1}, {{2, 0}, null_at(1)}, {}, {5, 6}}, null_at(0)}; /* [ @@ -2868,7 +2868,7 @@ struct JoinTestLists : public cudf::test::BaseFixture { [6] 6 ] */ - lcw probe{{{1}, {3}, {0}, {}, {{2, 0}, null_at(1)}, {5}, {6}}, null_at(2)}; + lcw left{{{1}, {3}, {0}, {}, {{2, 0}, null_at(1)}, {5}, {6}}, null_at(2)}; auto column_view_from_device_uvector(rmm::device_uvector const& 
vector) { @@ -2893,23 +2893,23 @@ struct JoinTestLists : public cudf::test::BaseFixture { JoinFunc join_func, cudf::out_of_bounds_policy oob_policy) { - auto const build_tv = cudf::table_view{{build}}; - auto const probe_tv = cudf::table_view{{probe}}; + auto const right_tv = cudf::table_view{{right}}; + auto const left_tv = cudf::table_view{{left}}; auto const [left_result_map, right_result_map] = - join_func(build_tv, - probe_tv, + join_func(right_tv, + left_tv, nulls_equal, cudf::get_default_stream(), cudf::get_current_device_resource_ref()); auto const left_result_table = - sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy); + sort_and_gather(right_tv, column_view_from_device_uvector(*left_result_map), oob_policy); auto const right_result_table = - sort_and_gather(probe_tv, column_view_from_device_uvector(*right_result_map), oob_policy); + sort_and_gather(left_tv, column_view_from_device_uvector(*right_result_map), oob_policy); - auto const left_gold_table = sort_and_gather(build_tv, left_gold_map, oob_policy); - auto const right_gold_table = sort_and_gather(probe_tv, right_gold_map, oob_policy); + auto const left_gold_table = sort_and_gather(right_tv, left_gold_map, oob_policy); + auto const right_gold_table = sort_and_gather(left_tv, right_gold_map, oob_policy); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*left_result_table, *left_gold_table); CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*right_result_table, *right_gold_table); From d9195b609800ef03fbc28ee97e42d938d0ad462e Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Wed, 6 May 2026 21:00:51 +0000 Subject: [PATCH 22/36] remove pylibcudf calls --- python/cudf/cudf/core/groupby/groupby.py | 27 +++++++----------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 8538953ea7e..816b466653c 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ 
b/python/cudf/cudf/core/groupby/groupby.py @@ -3018,29 +3018,18 @@ def _bool_reduce(self, op: str, *, skipna: bool, min_count: int): # nulls are preserved through the aggregation (min/max skip # nulls). For ``skipna=False``, nulls are replaced with True so # they don't flip ``all`` to False and always make ``any`` True. - def _to_bool_col(col): - from cudf.core.column import ColumnBase + bool_dtype = np.dtype(np.bool_) + def _to_bool_col(col): if is_dtype_obj_string(col.dtype): - counts_plc = plc.strings.attributes.count_characters( - col.plc_column - ) - gt_plc = plc.binaryop.binary_operation( - counts_plc, - plc.Scalar.from_py(0), - plc.binaryop.BinaryOperator.GREATER, - plc.DataType(plc.TypeId.BOOL8), - ) - bool_col = ColumnBase.create(gt_plc, np.dtype(np.bool_)) + bool_col = col.count_characters() > np.int8(0) else: # For numeric/bool inputs, cast to bool preserving nulls. - ne_plc = plc.binaryop.binary_operation( - col.plc_column, - plc.Scalar.from_py(0), - plc.binaryop.BinaryOperator.NOT_EQUAL, - plc.DataType(plc.TypeId.BOOL8), - ) - bool_col = ColumnBase.create(ne_plc, np.dtype(np.bool_)) + bool_col = col != 0 + # Normalize away pandas-extension bool dtypes so the downstream + # aggregation always sees ``np.bool_``. 
+ if bool_col.dtype != bool_dtype: + bool_col = ColumnBase.create(bool_col.plc_column, bool_dtype) if not skipna: bool_col = bool_col.fillna(True) return bool_col From e9dd32bbc4cad7f9ac678d494c921c9a786cda01 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Thu, 7 May 2026 21:48:58 -0500 Subject: [PATCH 23/36] Update python/cudf/cudf/core/groupby/groupby.py Co-authored-by: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> --- python/cudf/cudf/core/groupby/groupby.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 63b231152ce..153b459f0ba 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3028,8 +3028,7 @@ def _to_bool_col(col): bool_col = col != 0 # Normalize away pandas-extension bool dtypes so the downstream # aggregation always sees ``np.bool_``. - if bool_col.dtype != bool_dtype: - bool_col = ColumnBase.create(bool_col.plc_column, bool_dtype) + bool_col = bool_col.astype(bool_dtype, copy=False) if not skipna: bool_col = bool_col.fillna(True) return bool_col From e03db070fb4ced664e28a4eac5a0fedccbfb43c0 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Wed, 6 May 2026 13:22:57 -0700 Subject: [PATCH 24/36] Correctly handle blocks with "block byte size" fields in the Avro reader (#22387) When the number of elements in the Avro block is stored as a negative number, the block also includes its size in bytes. This PR allows the reader to correctly parse such files. 
Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Muhammad Haseeb (https://github.com/mhaseeb123) URL: https://github.com/rapidsai/cudf/pull/22387 --- cpp/src/io/avro/avro.cpp | 14 ++++++-- .../cudf/cudf/tests/data/avro/hang_input.avro | Bin 0 -> 101 bytes .../cudf/cudf/tests/input_output/test_avro.py | 32 ++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 python/cudf/cudf/tests/data/avro/hang_input.avro diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp index bf7d983d481..4639ea6ba23 100644 --- a/cpp/src/io/avro/avro.cpp +++ b/cpp/src/io/avro/avro.cpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -64,8 +64,16 @@ bool container::parse(file_metadata* md, size_t max_num_rows, size_t first_row) sig4 |= get_raw() << 24; if (sig4 != avro_magic) { return false; } for (;;) { - auto num_md_items = static_cast(get_encoded()); - if (num_md_items == 0) { break; } + auto md_items_signed = get_encoded(); + if (md_items_signed == 0) { break; } + if (md_items_signed < 0) { + // A negative count means a block's byte size follows. Read it and discard it. 
+ [[maybe_unused]] auto const md_block_size = get_encoded(); + md_items_signed = -md_items_signed; + } + // Check that the claimed item count can fit in the remaining input + if (md_items_signed > (m_end - m_cur) / 2) { return false; } + auto const num_md_items = static_cast(md_items_signed); for (uint32_t i = 0; i < num_md_items; i++) { auto const key = get_encoded(); auto const value = get_encoded(); diff --git a/python/cudf/cudf/tests/data/avro/hang_input.avro b/python/cudf/cudf/tests/data/avro/hang_input.avro new file mode 100644 index 0000000000000000000000000000000000000000..b26cb797fe8e1343a7560135f4d79e31902ca1d8 GIT binary patch literal 101 zcmeZI%3@>^ODrqO*DFrWNX<>`VyspwsVqoUvQjEaP0lY$QPNS$OUwoHfy}hb)SQ%J pC9CLam}psIPH8Gorlis(G_Aa2CKFT0s@-SqzWdK0sALQg0sxX>BQF2| literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/input_output/test_avro.py b/python/cudf/cudf/tests/input_output/test_avro.py index f982af4a85a..9820c6c9334 100644 --- a/python/cudf/cudf/tests/input_output/test_avro.py +++ b/python/cudf/cudf/tests/input_output/test_avro.py @@ -5,6 +5,9 @@ import datetime import io import pathlib +import subprocess +import sys +import textwrap import fastavro import numpy as np @@ -641,3 +644,32 @@ def test_avro_reader_multiblock( actual_df = cudf.read_avro(buffer, skiprows=skip_rows, num_rows=num_rows) assert_eq(expected_df, actual_df) + + +def test_avro_reader_no_hang_on_truncated_schema(datadir): + path = datadir / "avro" / "hang_input.avro" + assert path.is_file(), path + + script = textwrap.dedent( + f""" + import cudf + try: + cudf.read_avro({str(path)!r}) + except Exception: + pass + """ + ) + + timeout_s = 10 + try: + subprocess.run( + [sys.executable, "-c", script], + timeout=timeout_s, + check=False, + capture_output=True, + ) + except subprocess.TimeoutExpired: + pytest.fail( + f"cudf.read_avro hung on malformed input {path.name!r} " + f"(no completion within {timeout_s}s)" + ) From 62c8c5a99c664474d71eb77238ea75296d28005e Mon Sep 17 00:00:00 
2001 From: Paul Taylor <178183+trxcllnt@users.noreply.github.com> Date: Wed, 6 May 2026 13:29:07 -0700 Subject: [PATCH 25/36] Use `token.rapids.nvidia.com` when issuing S3 bucket creds in devcontainers (#22338) Set AWS_IDP_URL and update AWS_ROLE_ARN to use `token.rapids.nvidia.com` Authors: - Paul Taylor (https://github.com/trxcllnt) Approvers: - Gil Forsyth (https://github.com/gforsyth) URL: https://github.com/rapidsai/cudf/pull/22338 --- .devcontainer/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 4be36d4402c..b4b2ecb69e0 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -37,7 +37,8 @@ ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAG ### # sccache configuration ### -ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/nv-gha-token-sccache-devs" +ENV AWS_IDP_URL="https://token.rapids.nvidia.com" +ENV AWS_ROLE_ARN="arn:aws:iam::279114543810:role/rapids-token-sccache-devs" ENV SCCACHE_REGION="us-east-2" ENV SCCACHE_BUCKET="rapids-sccache-devs" ENV SCCACHE_S3_USE_PREPROCESSOR_CACHE_MODE=true From 6ffe708fa1d5f486abab54aeb53244bf3ff192a8 Mon Sep 17 00:00:00 2001 From: Kyle Edwards Date: Wed, 6 May 2026 16:41:36 -0400 Subject: [PATCH 26/36] Use static cudart by default (#22397) Issue: https://github.com/rapidsai/build-planning/issues/235 Authors: - Kyle Edwards (https://github.com/KyleFromNVIDIA) Approvers: - Bradley Dice (https://github.com/bdice) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/22397 --- .agents/skills/build-test-cudf-java/SKILL.md | 2 +- conda/recipes/cudf/recipe.yaml | 1 - cpp/CMakeLists.txt | 16 ++++++---------- cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 2 +- cpp/tests/CMakeLists.txt | 2 +- java/README.md | 7 ------- java/ci/build-in-docker.sh | 6 +----- java/pom.xml | 2 -- java/src/main/native/CMakeLists.txt | 10 +--------- python/libcudf/CMakeLists.txt | 4 +--- 10 
files changed, 12 insertions(+), 40 deletions(-) diff --git a/.agents/skills/build-test-cudf-java/SKILL.md b/.agents/skills/build-test-cudf-java/SKILL.md index 6284a5e4230..ca9eb575c37 100644 --- a/.agents/skills/build-test-cudf-java/SKILL.md +++ b/.agents/skills/build-test-cudf-java/SKILL.md @@ -51,7 +51,7 @@ export MAVEN_OPTS="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java. Export `MVN_COMMON_OPTS` to match the CI build configuration in `java/ci/build-in-docker.sh`. For example: ```bash -export MVN_COMMON_OPTS="-DCUDF_CPP_BUILD_DIR=$CUDF_CPP_BUILD_DIR -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=ON -DCUDA_STATIC_CUFILE=ON -DCUDA_STATIC_RUNTIME=ON -DCUDF_JNI_LIBCUDF_STATIC=ON" +export MVN_COMMON_OPTS="-DCUDF_CPP_BUILD_DIR=$CUDF_CPP_BUILD_DIR -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=ON -DCUDA_STATIC_CUFILE=ON -DCUDF_JNI_LIBCUDF_STATIC=ON" ``` ## Building cudf-java diff --git a/conda/recipes/cudf/recipe.yaml b/conda/recipes/cudf/recipe.yaml index f0f7768b1cd..f8516466a58 100644 --- a/conda/recipes/cudf/recipe.yaml +++ b/conda/recipes/cudf/recipe.yaml @@ -98,7 +98,6 @@ requirements: - pylibcudf =${{ version }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} - fsspec >=0.6.0 - - cuda-cudart - if: cuda_major == "12" then: cuda-python >=12.9.2,<13.0 else: cuda-python >=13.0.1,<14.0 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c2485171c71..6d684af8d99 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -58,9 +58,6 @@ option(CUDA_ENABLE_LINEINFO ) option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) -# cudart can be statically linked or dynamically linked. 
The python ecosystem wants dynamic linking -option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) - set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON) if(NOT BUILD_SHARED_LIBS) @@ -97,7 +94,6 @@ message( VERBOSE "CUDF: Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler): ${CUDA_ENABLE_LINEINFO}" ) -message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") message(VERBOSE "CUDF: Build with remote IO (e.g. AWS S3) support through KvikIO: ${CUDF_KVIKIO_REMOTE_IO}" ) @@ -1012,7 +1008,7 @@ if(TARGET conda_env) target_link_libraries(cudf PRIVATE conda_env) endif() -rapids_cuda_set_runtime(cudf USE_STATIC ${CUDA_STATIC_RUNTIME}) +rapids_cuda_set_runtime(cudf USE_STATIC ON) file( WRITE "${CUDF_BINARY_DIR}/fatbin.ld" @@ -1059,7 +1055,7 @@ if(CUDF_BUILD_TESTUTIL) PUBLIC cudf PRIVATE $ ) - rapids_cuda_set_runtime(cudftest_default_stream USE_STATIC ${CUDA_STATIC_RUNTIME}) + rapids_cuda_set_runtime(cudftest_default_stream USE_STATIC ON) add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream) @@ -1090,7 +1086,7 @@ if(CUDF_BUILD_TESTUTIL) cudftestutil INTERFACE "$" "$" ) - rapids_cuda_set_runtime(cudftestutil USE_STATIC ${CUDA_STATIC_RUNTIME}) + rapids_cuda_set_runtime(cudftestutil USE_STATIC ON) add_library(cudf::cudftestutil ALIAS cudftestutil) add_library(cudftestutil_impl INTERFACE) @@ -1151,7 +1147,7 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) endif() set(sanitizer_relative_genex - "$,$>" + "$,$>" ) set_target_properties( ${_tgt} @@ -1166,9 +1162,9 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL) ${_tgt} PRIVATE "$:${CUDF_CXX_FLAGS}>>" ) target_include_directories(${_tgt} PRIVATE "$") - target_link_libraries(${_tgt} PUBLIC CUDA::cudart rmm::rmm CUDA::sanitizer) + target_link_libraries(${_tgt} PUBLIC CUDA::cudart_static rmm::rmm CUDA::sanitizer) - rapids_cuda_set_runtime(${_tgt} USE_STATIC ${CUDA_STATIC_RUNTIME}) + rapids_cuda_set_runtime(${_tgt} USE_STATIC ON) add_library(cudf::${_tgt} ALIAS ${_tgt}) if("${_mode}" STREQUAL 
"testing") diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 10ba33eb397..d035e1ea6ab 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -9,7 +9,7 @@ add_executable(jitify_preprocess "${JITIFY_INCLUDE_DIR}/jitify2_preprocess.cpp") target_compile_definitions(jitify_preprocess PRIVATE "_FILE_OFFSET_BITS=64") -rapids_cuda_set_runtime(jitify_preprocess USE_STATIC ${CUDA_STATIC_RUNTIME}) +rapids_cuda_set_runtime(jitify_preprocess USE_STATIC ON) target_link_libraries(jitify_preprocess PUBLIC ${CMAKE_DL_LIBS}) # Take a list of files to JIT-compile and run them through jitify_preprocess. diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 68cde65c57b..a45b7280127 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -61,7 +61,7 @@ function(ConfigureTest CMAKE_TEST_NAME) ${CMAKE_TEST_NAME} PRIVATE cudf::cudftestutil_objects nvtx3::nvtx3-cpp $ "${_CUDF_TEST_EXTRA_LIBS}" ) - rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME}) + rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ON) rapids_test_add( NAME ${CMAKE_TEST_NAME} COMMAND ${CMAKE_TEST_NAME} diff --git a/java/README.md b/java/README.md index 7b33f303cf3..e1552712587 100644 --- a/java/README.md +++ b/java/README.md @@ -79,13 +79,6 @@ If you decide to build without Docker and the build script, examining the cmake settings in the [Java CI build script](ci/build-in-docker.sh) can be helpful if you are encountering difficulties during the build. -## Statically Linking the CUDA Runtime - -If you use the default cmake options libcudart will be dynamically linked to libcudf and libcudfjni. -To build with a static CUDA runtime, build libcudf with the `-DCUDA_STATIC_RUNTIME=ON` as a cmake -parameter, and similarly build with `-DCUDA_STATIC_RUNTIME=ON` when building the Java bindings -with Maven. 
- ### Building with a libcudf Archive When statically linking the CUDA runtime, it is recommended to build cuDF as an archive rather than diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh index 66140f387fd..e15536c8b6b 100755 --- a/java/ci/build-in-docker.sh +++ b/java/ci/build-in-docker.sh @@ -1,7 +1,7 @@ #!/bin/bash # -# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. All rights reserved. +# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -10,7 +10,6 @@ gcc --version SKIP_JAVA_TESTS=${SKIP_JAVA_TESTS:-true} BUILD_CPP_TESTS=${BUILD_CPP_TESTS:-OFF} -ENABLE_CUDA_STATIC_RUNTIME=${ENABLE_CUDA_STATIC_RUNTIME:-ON} ENABLE_PTDS=${ENABLE_PTDS:-ON} RMM_LOGGING_LEVEL=${RMM_LOGGING_LEVEL:-OFF} ENABLE_NVTX=${ENABLE_NVTX:-ON} @@ -27,7 +26,6 @@ OUT_PATH="$WORKSPACE/$OUT" echo "SIGN_FILE: $SIGN_FILE,\ SKIP_JAVA_TESTS: $SKIP_JAVA_TESTS,\ BUILD_CPP_TESTS: $BUILD_CPP_TESTS,\ - ENABLE_CUDA_STATIC_RUNTIME: $ENABLE_CUDA_STATIC_RUNTIME,\ ENABLED_PTDS: $ENABLE_PTDS,\ ENABLE_NVTX: $ENABLE_NVTX,\ ENABLE_GDS: $ENABLE_GDS,\ @@ -47,7 +45,6 @@ mkdir -p "$LIBCUDF_BUILD_PATH" cd "$LIBCUDF_BUILD_PATH" cmake .. 
-G"${CMAKE_GENERATOR}" \ -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \ - -DCUDA_STATIC_RUNTIME="$ENABLE_CUDA_STATIC_RUNTIME" \ -DUSE_NVTX="$ENABLE_NVTX" \ -DCUDF_LARGE_STRINGS_DISABLED=ON \ -DCUDF_USE_ARROW_STATIC=ON \ @@ -70,7 +67,6 @@ BUILD_ARG=( "-Dmaven.repo.local=$WORKSPACE/.m2" "-DskipTests=$SKIP_JAVA_TESTS" "-DCUDF_USE_PER_THREAD_DEFAULT_STREAM=$ENABLE_PTDS" - "-DCUDA_STATIC_RUNTIME=$ENABLE_CUDA_STATIC_RUNTIME" "-DCUDF_JNI_LIBCUDF_STATIC=ON" "-DUSE_GDS=$ENABLE_GDS" "-Dtest=*,!CuFileTest,!CudaFatalTest,!ColumnViewNonEmptyNullsTest" diff --git a/java/pom.xml b/java/pom.xml index 5df61ec4352..12af51eba71 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -153,7 +153,6 @@ false OFF - OFF OFF OFF RAPIDS @@ -484,7 +483,6 @@ - diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 1e7df3802b9..208bc4035c9 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -29,7 +29,6 @@ option(USE_NVTX "Build with NVTX support" ON) option(BUILD_SHARED_LIBS "Build cuDF JNI shared libraries" ON) option(BUILD_TESTS "Configure CMake to build tests" ON) option(CUDF_USE_PER_THREAD_DEFAULT_STREAM "Build with per-thread default stream" OFF) -option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) option(USE_GDS "Build with GPUDirect Storage (GDS)/cuFile support" OFF) option(CUDF_JNI_LIBCUDF_STATIC "Link with libcudf.a" OFF) option(CUDF_JNI_ENABLE_PROFILING "Build with profiling support" ON) @@ -41,7 +40,6 @@ message(VERBOSE "CUDF_JNI: Configure CMake to build tests: ${BUILD_TESTS}") message(VERBOSE "CUDF_JNI: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}" ) -message(VERBOSE "CUDF_JNI: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}") message(VERBOSE "CUDF_JNI: Build with GPUDirect Storage support: ${USE_GDS}") message(VERBOSE "CUDF_JNI: Link with libcudf statically: ${CUDF_JNI_LIBCUDF_STATIC}") @@ -279,13 +277,7 @@ target_link_libraries( # cudart can be statically 
linked or dynamically linked. The python ecosystem wants dynamic # linking -if(CUDA_STATIC_RUNTIME) - # Tell CMake what CUDA language runtime to use - set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Static) -else() - # Tell CMake what CUDA language runtime to use - set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Shared) -endif() +set_target_properties(cudfjni PROPERTIES CUDA_RUNTIME_LIBRARY Static) # ################################################################################################## # * install shared libraries ---------------------------------------------------------------------- diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 6feea8e8ba6..7f5176048ad 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -1,6 +1,6 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= @@ -63,8 +63,6 @@ if(NOT USE_NVCOMP_RUNTIME_WHEEL) endif() endif() -set(CUDA_STATIC_RUNTIME ON) - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) add_subdirectory(../../cpp cudf-cpp) From 6598b6399c017dc9cc5e892cfdccfca79ae6277d Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 6 May 2026 17:33:21 -0500 Subject: [PATCH 27/36] Fix `to_array` to return non-corrupted data (#22342) Fixes #22136 This PR guarded the homogeneous numeric `DataFrame.to_cupy` fast path so it only uses `table_to_array` when `dtype` is `None` or exactly matches the source column `dtype`.
Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) - https://github.com/apps/pre-commit-ci Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/22342 --- python/cudf/cudf/core/frame.py | 1 + .../cudf/tests/dataframe/methods/test_to_cupy.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 8c933649af2..fc9cd5b2cd3 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -842,6 +842,7 @@ def to_cupy( self._num_columns > 1 and na_value is None and self._columns[0].dtype.kind in {"i", "u", "f", "b"} + and (dtype is None or dtype == self._columns[0].dtype) and all( not col.nullable and col.dtype == self._columns[0].dtype for col in self._columns diff --git a/python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py b/python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py index 44ee7a4278d..3eb69e0e928 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_to_cupy.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 import cupy as cp @@ -64,6 +64,18 @@ def test_dataframe_to_cupy(): np.testing.assert_array_equal(df[k].to_numpy(), mat[:, i]) +@pytest.mark.parametrize("in_dtype", ["int32", "int64", "float32", "float64"]) +@pytest.mark.parametrize("out_dtype", ["int32", "int64", "float32", "float64"]) +def test_dataframe_to_cupy_dtype(in_dtype, out_dtype): + data = np.arange(12, dtype=in_dtype).reshape(3, 4) + df = cudf.DataFrame(data) + + result = df.to_cupy(dtype=out_dtype) + + assert result.dtype == np.dtype(out_dtype) + np.testing.assert_allclose(result.get(), data.astype(out_dtype)) + + @pytest.mark.parametrize("has_nulls", [False, True]) @pytest.mark.parametrize("use_na_value", [False, True]) def test_dataframe_to_cupy_single_column(has_nulls, use_na_value): From aa0a7070655a98701281c2ce5e01b84e747fdafa Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 May 2026 16:21:33 -0700 Subject: [PATCH 28/36] Use cudaStream_t instead of cuda_stream_view in pylibcudf Cython (#22368) Contributes to https://github.com/rapidsai/rmm/issues/2359 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/22368 --- .../cudf_polars/utils/cuda_stream.py | 5 +- python/pylibcudf/pylibcudf/binaryop.pxd | 5 +- python/pylibcudf/pylibcudf/binaryop.pyi | 6 +- python/pylibcudf/pylibcudf/binaryop.pyx | 16 +- python/pylibcudf/pylibcudf/column.pxd | 19 +- python/pylibcudf/pylibcudf/column.pyi | 32 ++- python/pylibcudf/pylibcudf/column.pyx | 118 ++++---- .../pylibcudf/pylibcudf/column_factories.pxd | 15 +- .../pylibcudf/pylibcudf/column_factories.pyi | 16 +- .../pylibcudf/pylibcudf/column_factories.pyx | 58 ++-- python/pylibcudf/pylibcudf/concatenate.pxd | 5 +- python/pylibcudf/pylibcudf/concatenate.pyi | 6 +- python/pylibcudf/pylibcudf/concatenate.pyx | 16 +- .../pylibcudf/pylibcudf/contiguous_split.pxd | 8 +- .../pylibcudf/pylibcudf/contiguous_split.pyi | 14 +- 
.../pylibcudf/pylibcudf/contiguous_split.pyx | 46 ++-- python/pylibcudf/pylibcudf/copying.pxd | 27 +- python/pylibcudf/pylibcudf/copying.pyi | 32 ++- python/pylibcudf/pylibcudf/copying.pyx | 116 ++++---- python/pylibcudf/pylibcudf/datetime.pxd | 23 +- python/pylibcudf/pylibcudf/datetime.pyi | 24 +- python/pylibcudf/pylibcudf/datetime.pyx | 93 ++++--- .../pylibcudf/experimental/_join_streams.pxd | 5 +- .../pylibcudf/experimental/_join_streams.pyi | 8 +- .../pylibcudf/experimental/_join_streams.pyx | 20 +- python/pylibcudf/pylibcudf/filling.pxd | 13 +- python/pylibcudf/pylibcudf/filling.pyi | 17 +- python/pylibcudf/pylibcudf/filling.pyx | 48 ++-- python/pylibcudf/pylibcudf/groupby.pxd | 15 +- python/pylibcudf/pylibcudf/groupby.pyi | 14 +- python/pylibcudf/pylibcudf/groupby.pyx | 68 +++-- python/pylibcudf/pylibcudf/hashing.pxd | 23 +- python/pylibcudf/pylibcudf/hashing.pyi | 24 +- python/pylibcudf/pylibcudf/hashing.pyx | 93 ++++--- python/pylibcudf/pylibcudf/interop.pxd | 7 +- python/pylibcudf/pylibcudf/interop.pyi | 12 +- python/pylibcudf/pylibcudf/interop.pyx | 17 +- python/pylibcudf/pylibcudf/io/avro.pxd | 5 +- python/pylibcudf/pylibcudf/io/avro.pyi | 6 +- python/pylibcudf/pylibcudf/io/avro.pyx | 8 +- python/pylibcudf/pylibcudf/io/csv.pxd | 7 +- python/pylibcudf/pylibcudf/io/csv.pyi | 8 +- python/pylibcudf/pylibcudf/io/csv.pyx | 13 +- .../pylibcudf/io/experimental/hybrid_scan.pxd | 2 +- .../pylibcudf/io/experimental/hybrid_scan.pyi | 20 +- .../pylibcudf/io/experimental/hybrid_scan.pyx | 66 ++--- python/pylibcudf/pylibcudf/io/json.pxd | 11 +- python/pylibcudf/pylibcudf/io/json.pyi | 10 +- python/pylibcudf/pylibcudf/io/json.pyx | 31 ++- python/pylibcudf/pylibcudf/io/orc.pxd | 9 +- python/pylibcudf/pylibcudf/io/orc.pyi | 10 +- python/pylibcudf/pylibcudf/io/orc.pyx | 24 +- python/pylibcudf/pylibcudf/io/parquet.pxd | 8 +- python/pylibcudf/pylibcudf/io/parquet.pyi | 11 +- python/pylibcudf/pylibcudf/io/parquet.pyx | 26 +- python/pylibcudf/pylibcudf/io/text.pxd | 5 +- 
python/pylibcudf/pylibcudf/io/text.pyi | 6 +- python/pylibcudf/pylibcudf/io/text.pyx | 14 +- python/pylibcudf/pylibcudf/io/timezone.pxd | 6 +- python/pylibcudf/pylibcudf/io/timezone.pyi | 6 +- python/pylibcudf/pylibcudf/io/timezone.pyx | 14 +- python/pylibcudf/pylibcudf/io/types.pxd | 5 +- python/pylibcudf/pylibcudf/io/types.pyx | 3 +- python/pylibcudf/pylibcudf/join.pxd | 35 ++- python/pylibcudf/pylibcudf/join.pyi | 40 +-- python/pylibcudf/pylibcudf/join.pyx | 211 ++++++++------ python/pylibcudf/pylibcudf/json.pxd | 5 +- python/pylibcudf/pylibcudf/json.pyi | 6 +- python/pylibcudf/pylibcudf/json.pyx | 12 +- python/pylibcudf/pylibcudf/labeling.pxd | 5 +- python/pylibcudf/pylibcudf/labeling.pyi | 6 +- python/pylibcudf/pylibcudf/labeling.pyx | 12 +- .../pylibcudf/pylibcudf/libcudf/binaryop.pxd | 10 +- .../pylibcudf/libcudf/column/column.pxd | 6 +- .../libcudf/column/column_factories.pxd | 28 +- .../pylibcudf/libcudf/concatenate.pxd | 6 +- .../pylibcudf/libcudf/contiguous_split.pxd | 8 +- .../pylibcudf/pylibcudf/libcudf/copying.pxd | 40 +-- .../pylibcudf/pylibcudf/libcudf/datetime.pxd | 24 +- .../libcudf/detail/utilities/stream_pool.pxd | 29 +- .../pylibcudf/libcudf/distinct_count.pxd | 6 +- .../pylibcudf/pylibcudf/libcudf/filling.pxd | 14 +- .../pylibcudf/pylibcudf/libcudf/groupby.pxd | 12 +- python/pylibcudf/pylibcudf/libcudf/hash.pxd | 22 +- .../pylibcudf/pylibcudf/libcudf/interop.pxd | 24 +- .../pylibcudf/pylibcudf/libcudf/io/avro.pxd | 4 +- python/pylibcudf/pylibcudf/libcudf/io/csv.pxd | 6 +- .../pylibcudf/libcudf/io/hybrid_scan.pxd | 20 +- .../pylibcudf/pylibcudf/libcudf/io/json.pxd | 6 +- python/pylibcudf/pylibcudf/libcudf/io/orc.pxd | 8 +- .../pylibcudf/libcudf/io/orc_metadata.pxd | 6 +- .../pylibcudf/libcudf/io/parquet.pxd | 12 +- .../pylibcudf/pylibcudf/libcudf/io/text.pxd | 4 +- .../pylibcudf/libcudf/io/timezone.pxd | 4 +- python/pylibcudf/pylibcudf/libcudf/join.pxd | 52 ++-- python/pylibcudf/pylibcudf/libcudf/json.pxd | 4 +- 
.../pylibcudf/pylibcudf/libcudf/labeling.pxd | 4 +- .../pylibcudf/libcudf/lists/combine.pxd | 8 +- .../pylibcudf/libcudf/lists/contains.pxd | 12 +- .../libcudf/lists/count_elements.pxd | 4 +- .../pylibcudf/libcudf/lists/explode.pxd | 4 +- .../pylibcudf/libcudf/lists/extract.pxd | 6 +- .../pylibcudf/libcudf/lists/filling.pxd | 6 +- .../pylibcudf/libcudf/lists/gather.pxd | 4 +- .../libcudf/lists/lists_column_view.pxd | 4 +- .../pylibcudf/libcudf/lists/reverse.pxd | 4 +- .../libcudf/lists/set_operations.pxd | 10 +- .../pylibcudf/libcudf/lists/sorting.pxd | 6 +- .../libcudf/lists/stream_compaction.pxd | 6 +- python/pylibcudf/pylibcudf/libcudf/merge.pxd | 4 +- .../pylibcudf/pylibcudf/libcudf/null_mask.pxd | 16 +- .../libcudf/nvtext/byte_pair_encode.pxd | 6 +- .../pylibcudf/libcudf/nvtext/deduplicate.pxd | 8 +- .../libcudf/nvtext/edit_distance.pxd | 6 +- .../libcudf/nvtext/generate_ngrams.pxd | 8 +- .../pylibcudf/libcudf/nvtext/jaccard.pxd | 4 +- .../pylibcudf/libcudf/nvtext/minhash.pxd | 10 +- .../libcudf/nvtext/ngrams_tokenize.pxd | 4 +- .../pylibcudf/libcudf/nvtext/normalize.pxd | 8 +- .../pylibcudf/libcudf/nvtext/replace.pxd | 6 +- .../pylibcudf/libcudf/nvtext/stemmer.pxd | 8 +- .../pylibcudf/libcudf/nvtext/tokenize.pxd | 18 +- .../libcudf/nvtext/wordpiece_tokenize.pxd | 6 +- .../pylibcudf/libcudf/partitioning.pxd | 10 +- .../pylibcudf/pylibcudf/libcudf/quantiles.pxd | 6 +- python/pylibcudf/pylibcudf/libcudf/reduce.pxd | 8 +- .../pylibcudf/pylibcudf/libcudf/replace.pxd | 18 +- .../pylibcudf/pylibcudf/libcudf/reshape.pxd | 8 +- .../pylibcudf/pylibcudf/libcudf/rolling.pxd | 10 +- python/pylibcudf/pylibcudf/libcudf/round.pxd | 6 +- .../pylibcudf/libcudf/scalar/scalar.pxd | 18 +- .../libcudf/scalar/scalar_factories.pxd | 18 +- python/pylibcudf/pylibcudf/libcudf/search.pxd | 8 +- .../pylibcudf/pylibcudf/libcudf/sorting.pxd | 26 +- .../pylibcudf/libcudf/stream_compaction.pxd | 18 +- .../pylibcudf/libcudf/strings/attributes.pxd | 8 +- 
.../pylibcudf/libcudf/strings/capitalize.pxd | 8 +- .../pylibcudf/libcudf/strings/case.pxd | 8 +- .../pylibcudf/libcudf/strings/char_types.pxd | 6 +- .../pylibcudf/libcudf/strings/combine.pxd | 12 +- .../pylibcudf/libcudf/strings/contains.pxd | 12 +- .../strings/convert/convert_booleans.pxd | 6 +- .../strings/convert/convert_datetime.pxd | 8 +- .../strings/convert/convert_durations.pxd | 6 +- .../strings/convert/convert_fixed_point.pxd | 8 +- .../strings/convert/convert_floats.pxd | 8 +- .../strings/convert/convert_integers.pxd | 16 +- .../libcudf/strings/convert/convert_ipv4.pxd | 8 +- .../libcudf/strings/convert/convert_lists.pxd | 4 +- .../libcudf/strings/convert/convert_urls.pxd | 6 +- .../pylibcudf/libcudf/strings/extract.pxd | 8 +- .../pylibcudf/libcudf/strings/find.pxd | 20 +- .../libcudf/strings/find_multiple.pxd | 6 +- .../pylibcudf/libcudf/strings/findall.pxd | 6 +- .../pylibcudf/libcudf/strings/padding.pxd | 8 +- .../pylibcudf/libcudf/strings/repeat.pxd | 6 +- .../pylibcudf/libcudf/strings/replace.pxd | 8 +- .../pylibcudf/libcudf/strings/replace_re.pxd | 8 +- .../pylibcudf/libcudf/strings/reverse.pxd | 4 +- .../libcudf/strings/split/partition.pxd | 6 +- .../pylibcudf/libcudf/strings/split/split.pxd | 20 +- .../libcudf/strings/strings_column_view.pxd | 6 +- .../pylibcudf/libcudf/strings/strip.pxd | 4 +- .../pylibcudf/libcudf/strings/substring.pxd | 6 +- .../pylibcudf/libcudf/strings/translate.pxd | 6 +- .../pylibcudf/libcudf/strings/wrap.pxd | 4 +- .../libcudf/structs/structs_column_view.pxd | 4 +- .../pylibcudf/libcudf/table/table.pxd | 6 +- .../pylibcudf/pylibcudf/libcudf/transform.pxd | 20 +- .../pylibcudf/pylibcudf/libcudf/transpose.pxd | 4 +- python/pylibcudf/pylibcudf/libcudf/unary.pxd | 14 +- .../pylibcudf/libcudf/unique_count.pxd | 6 +- .../libcudf/utilities/default_stream.pxd | 7 +- python/pylibcudf/pylibcudf/lists.pxd | 39 ++- python/pylibcudf/pylibcudf/lists.pyi | 40 +-- python/pylibcudf/pylibcudf/lists.pyx | 167 ++++++----- 
python/pylibcudf/pylibcudf/merge.pxd | 5 +- python/pylibcudf/pylibcudf/merge.pyi | 6 +- python/pylibcudf/pylibcudf/merge.pyx | 12 +- python/pylibcudf/pylibcudf/null_mask.pxd | 17 +- python/pylibcudf/pylibcudf/null_mask.pyi | 16 +- python/pylibcudf/pylibcudf/null_mask.pyx | 64 +++-- .../pylibcudf/nvtext/byte_pair_encode.pxd | 5 +- .../pylibcudf/nvtext/byte_pair_encode.pyi | 8 +- .../pylibcudf/nvtext/byte_pair_encode.pyx | 23 +- .../pylibcudf/nvtext/deduplicate.pxd | 9 +- .../pylibcudf/nvtext/deduplicate.pyi | 10 +- .../pylibcudf/nvtext/deduplicate.pyx | 34 +-- .../pylibcudf/nvtext/edit_distance.pxd | 7 +- .../pylibcudf/nvtext/edit_distance.pyi | 8 +- .../pylibcudf/nvtext/edit_distance.pyx | 19 +- .../pylibcudf/nvtext/generate_ngrams.pxd | 9 +- .../pylibcudf/nvtext/generate_ngrams.pyi | 10 +- .../pylibcudf/nvtext/generate_ngrams.pyx | 30 +- python/pylibcudf/pylibcudf/nvtext/jaccard.pxd | 5 +- python/pylibcudf/pylibcudf/nvtext/jaccard.pyi | 6 +- python/pylibcudf/pylibcudf/nvtext/jaccard.pyx | 12 +- python/pylibcudf/pylibcudf/nvtext/minhash.pxd | 11 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyi | 12 +- python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 39 +-- .../pylibcudf/nvtext/ngrams_tokenize.pxd | 5 +- .../pylibcudf/nvtext/ngrams_tokenize.pyi | 6 +- .../pylibcudf/nvtext/ngrams_tokenize.pyx | 12 +- .../pylibcudf/pylibcudf/nvtext/normalize.pxd | 9 +- .../pylibcudf/pylibcudf/nvtext/normalize.pyi | 10 +- .../pylibcudf/pylibcudf/nvtext/normalize.pyx | 28 +- python/pylibcudf/pylibcudf/nvtext/replace.pxd | 7 +- python/pylibcudf/pylibcudf/nvtext/replace.pyi | 8 +- python/pylibcudf/pylibcudf/nvtext/replace.pyx | 27 +- python/pylibcudf/pylibcudf/nvtext/stemmer.pxd | 7 +- python/pylibcudf/pylibcudf/nvtext/stemmer.pyi | 8 +- python/pylibcudf/pylibcudf/nvtext/stemmer.pyx | 21 +- .../pylibcudf/pylibcudf/nvtext/tokenize.pxd | 17 +- .../pylibcudf/pylibcudf/nvtext/tokenize.pyi | 20 +- .../pylibcudf/pylibcudf/nvtext/tokenize.pyx | 79 +++--- 
.../pylibcudf/nvtext/wordpiece_tokenize.pxd | 5 +- .../pylibcudf/nvtext/wordpiece_tokenize.pyi | 8 +- .../pylibcudf/nvtext/wordpiece_tokenize.pyx | 19 +- python/pylibcudf/pylibcudf/partitioning.pxd | 7 +- python/pylibcudf/pylibcudf/partitioning.pyi | 8 +- python/pylibcudf/pylibcudf/partitioning.pyx | 30 +- python/pylibcudf/pylibcudf/quantiles.pxd | 7 +- python/pylibcudf/pylibcudf/quantiles.pyi | 8 +- python/pylibcudf/pylibcudf/quantiles.pyx | 21 +- python/pylibcudf/pylibcudf/reduce.pxd | 11 +- python/pylibcudf/pylibcudf/reduce.pyi | 12 +- python/pylibcudf/pylibcudf/reduce.pyx | 36 +-- python/pylibcudf/pylibcudf/replace.pxd | 11 +- python/pylibcudf/pylibcudf/replace.pyi | 12 +- python/pylibcudf/pylibcudf/replace.pyx | 51 ++-- python/pylibcudf/pylibcudf/reshape.pxd | 9 +- python/pylibcudf/pylibcudf/reshape.pyi | 10 +- python/pylibcudf/pylibcudf/reshape.pyx | 28 +- python/pylibcudf/pylibcudf/rolling.pxd | 9 +- python/pylibcudf/pylibcudf/rolling.pyi | 10 +- python/pylibcudf/pylibcudf/rolling.pyx | 34 ++- python/pylibcudf/pylibcudf/round.pxd | 5 +- python/pylibcudf/pylibcudf/round.pyi | 6 +- python/pylibcudf/pylibcudf/round.pyx | 19 +- python/pylibcudf/pylibcudf/scalar.pxd | 7 +- python/pylibcudf/pylibcudf/scalar.pyi | 19 +- python/pylibcudf/pylibcudf/scalar.pyx | 259 +++++++++++------- python/pylibcudf/pylibcudf/search.pxd | 9 +- python/pylibcudf/pylibcudf/search.pyi | 10 +- python/pylibcudf/pylibcudf/search.pyx | 30 +- python/pylibcudf/pylibcudf/sorting.pxd | 27 +- python/pylibcudf/pylibcudf/sorting.pyi | 28 +- python/pylibcudf/pylibcudf/sorting.pyx | 109 ++++---- .../pylibcudf/pylibcudf/stream_compaction.pxd | 17 +- .../pylibcudf/pylibcudf/stream_compaction.pyi | 18 +- .../pylibcudf/pylibcudf/stream_compaction.pyx | 73 ++--- .../pylibcudf/strings/attributes.pxd | 9 +- .../pylibcudf/strings/attributes.pyi | 10 +- .../pylibcudf/strings/attributes.pyx | 30 +- .../pylibcudf/strings/capitalize.pxd | 9 +- .../pylibcudf/strings/capitalize.pyi | 10 +- 
.../pylibcudf/strings/capitalize.pyx | 32 ++- python/pylibcudf/pylibcudf/strings/case.pxd | 9 +- python/pylibcudf/pylibcudf/strings/case.pyi | 10 +- python/pylibcudf/pylibcudf/strings/case.pyx | 30 +- .../pylibcudf/strings/char_types.pxd | 7 +- .../pylibcudf/strings/char_types.pyi | 8 +- .../pylibcudf/strings/char_types.pyx | 21 +- .../pylibcudf/pylibcudf/strings/combine.pxd | 9 +- .../pylibcudf/pylibcudf/strings/combine.pyi | 10 +- .../pylibcudf/pylibcudf/strings/combine.pyx | 38 +-- .../pylibcudf/pylibcudf/strings/contains.pxd | 11 +- .../pylibcudf/pylibcudf/strings/contains.pyi | 12 +- .../pylibcudf/pylibcudf/strings/contains.pyx | 43 +-- .../strings/convert/convert_booleans.pxd | 7 +- .../strings/convert/convert_booleans.pyi | 8 +- .../strings/convert/convert_booleans.pyx | 21 +- .../strings/convert/convert_datetime.pxd | 9 +- .../strings/convert/convert_datetime.pyi | 10 +- .../strings/convert/convert_datetime.pyx | 30 +- .../strings/convert/convert_durations.pxd | 7 +- .../strings/convert/convert_durations.pyi | 8 +- .../strings/convert/convert_durations.pyx | 21 +- .../strings/convert/convert_fixed_point.pxd | 9 +- .../strings/convert/convert_fixed_point.pyi | 10 +- .../strings/convert/convert_fixed_point.pyx | 30 +- .../strings/convert/convert_floats.pxd | 9 +- .../strings/convert/convert_floats.pyi | 10 +- .../strings/convert/convert_floats.pyx | 32 ++- .../strings/convert/convert_integers.pxd | 15 +- .../strings/convert/convert_integers.pyi | 16 +- .../strings/convert/convert_integers.pyx | 59 ++-- .../strings/convert/convert_ipv4.pxd | 9 +- .../strings/convert/convert_ipv4.pyi | 10 +- .../strings/convert/convert_ipv4.pyx | 30 +- .../strings/convert/convert_lists.pxd | 5 +- .../strings/convert/convert_lists.pyi | 6 +- .../strings/convert/convert_lists.pyx | 14 +- .../strings/convert/convert_urls.pxd | 7 +- .../strings/convert/convert_urls.pyi | 8 +- .../strings/convert/convert_urls.pyx | 25 +- .../pylibcudf/pylibcudf/strings/extract.pxd | 9 +- 
.../pylibcudf/pylibcudf/strings/extract.pyi | 10 +- .../pylibcudf/pylibcudf/strings/extract.pyx | 30 +- python/pylibcudf/pylibcudf/strings/find.pxd | 13 +- python/pylibcudf/pylibcudf/strings/find.pyi | 14 +- python/pylibcudf/pylibcudf/strings/find.pyx | 56 ++-- .../pylibcudf/strings/find_multiple.pxd | 7 +- .../pylibcudf/strings/find_multiple.pyi | 8 +- .../pylibcudf/strings/find_multiple.pyx | 21 +- .../pylibcudf/pylibcudf/strings/findall.pxd | 7 +- .../pylibcudf/pylibcudf/strings/findall.pyi | 8 +- .../pylibcudf/pylibcudf/strings/findall.pyx | 21 +- .../pylibcudf/pylibcudf/strings/padding.pxd | 9 +- .../pylibcudf/pylibcudf/strings/padding.pyi | 10 +- .../pylibcudf/pylibcudf/strings/padding.pyx | 30 +- python/pylibcudf/pylibcudf/strings/repeat.pxd | 5 +- python/pylibcudf/pylibcudf/strings/repeat.pyi | 6 +- python/pylibcudf/pylibcudf/strings/repeat.pyx | 14 +- .../pylibcudf/pylibcudf/strings/replace.pxd | 9 +- .../pylibcudf/pylibcudf/strings/replace.pyi | 10 +- .../pylibcudf/pylibcudf/strings/replace.pyx | 32 ++- .../pylibcudf/strings/replace_re.pxd | 7 +- .../pylibcudf/strings/replace_re.pyi | 10 +- .../pylibcudf/strings/replace_re.pyx | 27 +- .../pylibcudf/pylibcudf/strings/reverse.pyi | 6 +- .../pylibcudf/pylibcudf/strings/reverse.pyx | 12 +- python/pylibcudf/pylibcudf/strings/slice.pxd | 5 +- python/pylibcudf/pylibcudf/strings/slice.pyi | 6 +- python/pylibcudf/pylibcudf/strings/slice.pyx | 20 +- .../pylibcudf/strings/split/partition.pxd | 7 +- .../pylibcudf/strings/split/partition.pyi | 8 +- .../pylibcudf/strings/split/partition.pyx | 25 +- .../pylibcudf/strings/split/split.pxd | 19 +- .../pylibcudf/strings/split/split.pyi | 20 +- .../pylibcudf/strings/split/split.pyx | 82 +++--- python/pylibcudf/pylibcudf/strings/strip.pxd | 5 +- python/pylibcudf/pylibcudf/strings/strip.pyi | 6 +- python/pylibcudf/pylibcudf/strings/strip.pyx | 14 +- .../pylibcudf/pylibcudf/strings/translate.pxd | 7 +- .../pylibcudf/pylibcudf/strings/translate.pyi | 8 +- 
.../pylibcudf/pylibcudf/strings/translate.pyx | 21 +- python/pylibcudf/pylibcudf/strings/wrap.pxd | 5 +- python/pylibcudf/pylibcudf/strings/wrap.pyi | 6 +- python/pylibcudf/pylibcudf/strings/wrap.pyx | 12 +- python/pylibcudf/pylibcudf/table.pxd | 7 +- python/pylibcudf/pylibcudf/table.pyi | 10 +- python/pylibcudf/pylibcudf/table.pyx | 29 +- python/pylibcudf/pylibcudf/transform.pxd | 19 +- python/pylibcudf/pylibcudf/transform.pyi | 20 +- python/pylibcudf/pylibcudf/transform.pyx | 92 ++++--- python/pylibcudf/pylibcudf/transpose.pxd | 5 +- python/pylibcudf/pylibcudf/transpose.pyi | 6 +- python/pylibcudf/pylibcudf/transpose.pyx | 12 +- python/pylibcudf/pylibcudf/unary.pxd | 15 +- python/pylibcudf/pylibcudf/unary.pyi | 16 +- python/pylibcudf/pylibcudf/unary.pyx | 59 ++-- python/pylibcudf/pylibcudf/utils.pxd | 6 +- python/pylibcudf/pylibcudf/utils.pyi | 9 +- python/pylibcudf/pylibcudf/utils.pyx | 6 +- python/pylibcudf/tests/test_experimental.py | 23 +- .../pylibcudf/tests/test_stream_protocol.py | 74 +++++ 357 files changed, 3470 insertions(+), 2967 deletions(-) create mode 100644 python/pylibcudf/tests/test_stream_protocol.py diff --git a/python/cudf_polars/cudf_polars/utils/cuda_stream.py b/python/cudf_polars/cudf_polars/utils/cuda_stream.py index a42252157b4..c0708d3bea8 100644 --- a/python/cudf_polars/cudf_polars/utils/cuda_stream.py +++ b/python/cudf_polars/cudf_polars/utils/cuda_stream.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from collections.abc import Callable, Sequence + from pylibcudf.utils import CudaStreamLike from rmm.pylibrmm.stream import Stream @@ -27,7 +28,7 @@ def get_cuda_stream() -> Stream: def join_cuda_streams( - *, downstreams: Sequence[Stream], upstreams: Sequence[Stream] + *, downstreams: Sequence[CudaStreamLike], upstreams: Sequence[CudaStreamLike] ) -> None: """ Join multiple CUDA streams. 
@@ -46,7 +47,7 @@ def join_cuda_streams( def get_joined_cuda_stream( - get_cuda_stream: Callable[[], Stream], *, upstreams: Sequence[Stream] + get_cuda_stream: Callable[[], Stream], *, upstreams: Sequence[CudaStreamLike] ) -> Stream: """ Return a CUDA stream that is joined to the given streams. diff --git a/python/pylibcudf/pylibcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/binaryop.pxd index 29c9f3d98ea..a34a02b2191 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/binaryop.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from pylibcudf.libcudf.binaryop cimport binary_operator from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar @@ -25,7 +24,7 @@ cpdef Column binary_operation( RightBinaryOperand rhs, binary_operator op, DataType output_type, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/binaryop.pyi b/python/pylibcudf/pylibcudf/binaryop.pyi index 52263440db3..1f3c9a2cb64 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyi +++ b/python/pylibcudf/pylibcudf/binaryop.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike class BinaryOperator(IntEnum): ADD = ... 
@@ -52,7 +52,7 @@ def binary_operation( rhs: Column | Scalar, op: BinaryOperator, output_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_supported_operation( diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index a46b6aaaa81..20a69d60727 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator import dereference @@ -20,6 +20,7 @@ from .column cimport Column from .scalar cimport Scalar from .types cimport DataType from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["BinaryOperator", "binary_operation", "is_supported_operation"] @@ -28,7 +29,7 @@ cpdef Column binary_operation( RightBinaryOperand rhs, binary_operator op, DataType output_type, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a binary operation between a column and another column or scalar. 
@@ -61,7 +62,8 @@ cpdef Column binary_operation( The result of the binary operation """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if LeftBinaryOperand is Column and RightBinaryOperand is Column: @@ -71,7 +73,7 @@ cpdef Column binary_operation( rhs.view(), op, output_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) elif LeftBinaryOperand is Column and RightBinaryOperand is Scalar: @@ -81,7 +83,7 @@ cpdef Column binary_operation( dereference(rhs.c_obj), op, output_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) elif LeftBinaryOperand is Scalar and RightBinaryOperand is Column: @@ -91,13 +93,13 @@ cpdef Column binary_operation( rhs.view(), op, output_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) else: raise ValueError(f"Invalid arguments {lhs} and {rhs}") - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef bool is_supported_operation( diff --git a/python/pylibcudf/pylibcudf/column.pxd b/python/pylibcudf/pylibcudf/column.pxd index 7348d68f6de..429f85f39b0 100644 --- a/python/pylibcudf/pylibcudf/column.pxd +++ b/python/pylibcudf/pylibcudf/column.pxd @@ -6,7 +6,6 @@ from libcpp.vector cimport vector from libc.stdint cimport uint64_t from rmm.librmm.device_buffer cimport device_buffer -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport ( @@ -27,7 +26,7 @@ cdef class OwnerWithCAI: cdef dict cai @staticmethod - cdef create(column_view cv, object owner, Stream stream) + cdef create(column_view cv, object owner, object stream) cdef class OwnerMaskWithCAI: @@ -38,7 +37,7 @@ cdef class OwnerMaskWithCAI: cdef create(column_view cv, object owner) -cdef gpumemoryview _copy_array_to_device(object buf, Stream stream=*) 
+cdef gpumemoryview _copy_array_to_device(object buf, object stream=*) cdef class Column: @@ -61,7 +60,7 @@ cdef class Column: @staticmethod cdef Column from_libcudf( unique_ptr[column] libcudf_col, - Stream stream, + object stream, DeviceMemoryResource mr ) @@ -72,7 +71,7 @@ cdef class Column: cdef Column from_column_view_of_arbitrary( const column_view& cv, object owner, - Stream stream, + object stream, ) @staticmethod @@ -81,10 +80,10 @@ cdef class Column: tuple shape, DataType dtype, Column base=*, - Stream stream=*, + object stream=*, ) - cpdef Scalar to_scalar(self, Stream stream=*, DeviceMemoryResource mr=*) + cpdef Scalar to_scalar(self, object stream=*, DeviceMemoryResource mr=*) cpdef DataType type(self) cpdef Column child(self, size_type index) cpdef size_type num_children(self) @@ -95,7 +94,7 @@ cdef class Column: cpdef object data(self) cpdef object null_mask(self) cpdef list children(self) - cpdef Column copy(self, Stream stream=*, DeviceMemoryResource mr=*) + cpdef Column copy(self, object stream=*, DeviceMemoryResource mr=*) cpdef uint64_t device_buffer_size(self) cpdef Column with_mask(self, object, size_type, bint validate=*) @@ -108,10 +107,10 @@ cdef class ListsColumnView: cpdef child(self) cpdef offsets(self) cdef lists_column_view view(self) nogil - cpdef Column get_sliced_child(self, Stream stream=*) + cpdef Column get_sliced_child(self, object stream=*) cdef class StructsColumnView: cdef Column _column cdef structs_column_view view(self) nogil - cpdef Column get_sliced_child(self, int index, Stream stream=*) + cpdef Column get_sliced_child(self, int index, object stream=*) diff --git a/python/pylibcudf/pylibcudf/column.pyi b/python/pylibcudf/pylibcudf/column.pyi index 3ac4641ac13..3ff7f53f356 100644 --- a/python/pylibcudf/pylibcudf/column.pyi +++ b/python/pylibcudf/pylibcudf/column.pyi @@ -6,12 +6,12 @@ from typing import Any, Protocol, TypedDict from rmm.pylibrmm.device_buffer import DeviceBuffer from rmm.pylibrmm.memory_resource import 
DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf._interop_helpers import ArrowLike, ColumnMetadata from pylibcudf.scalar import Scalar from pylibcudf.span import Span from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike class ArrayInterfaceBase(TypedDict): shape: tuple[int, ...] @@ -64,7 +64,7 @@ class Column: def num_children(self) -> int: ... def copy( self, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def device_buffer_size(self) -> int: ... @@ -77,19 +77,19 @@ class Column: def from_scalar( scalar: Scalar, size: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def to_scalar( self, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Scalar: ... @staticmethod def all_null_like( like: Column, size: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... @staticmethod @@ -99,32 +99,34 @@ class Column: def to_arrow( self, metadata: ColumnMetadata | str | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> ArrowLike: ... # Private methods below are included because polars is currently using them, # but we want to remove stubs for these private methods eventually def _to_schema(self, metadata: Any = None) -> Any: ... - def _to_host_array(self, stream: Stream) -> Any: ... + def _to_host_array(self, stream: CudaStreamLike) -> Any: ... @staticmethod def from_arrow( obj: ArrowLike, dtype: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
@classmethod def from_cuda_array_interface( - cls, obj: SupportsCudaArrayInterface, stream: Stream | None = None + cls, + obj: SupportsCudaArrayInterface, + stream: CudaStreamLike | None = None, ) -> Column: ... @classmethod def from_array_interface( - cls, obj: SupportsArrayInterface, stream: Stream | None = None + cls, obj: SupportsArrayInterface, stream: CudaStreamLike | None = None ) -> Column: ... @classmethod def from_array( cls, obj: SupportsCudaArrayInterface | SupportsArrayInterface, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> Column: ... @staticmethod def struct_from_children(children: Sequence[Column]) -> Column: ... @@ -132,21 +134,23 @@ class Column: def from_iterable_of_py( obj: Iterable, dtype: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> Column: ... class ListsColumnView: def __init__(self, column: Column): ... def child(self) -> Column: ... def offsets(self) -> Column: ... - def get_sliced_child(self, stream: Stream | None = None) -> Column: ... + def get_sliced_child( + self, stream: CudaStreamLike | None = None + ) -> Column: ... class StructsColumnView: def __init__(self, column: Column): ... def child(self) -> Column: ... def offsets(self) -> Column: ... def get_sliced_child( - self, index: int, stream: Stream | None = None + self, index: int, stream: CudaStreamLike | None = None ) -> Column: ... 
def is_c_contiguous( diff --git a/python/pylibcudf/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx index 96137f96256..fc8745dae26 100644 --- a/python/pylibcudf/pylibcudf/column.pyx +++ b/python/pylibcudf/pylibcudf/column.pyx @@ -67,6 +67,7 @@ from itertools import accumulate import functools import operator from typing import Iterable +from cuda.bindings.cyruntime cimport cudaStream_t try: import pyarrow as pa @@ -96,7 +97,7 @@ cdef class _ArrowColumnHolder: cdef class OwnerWithCAI: """An interface for column view's data with gpumemoryview via CAI.""" @staticmethod - cdef create(column_view cv, object owner, Stream stream): + cdef create(column_view cv, object owner, object stream): obj = OwnerWithCAI() obj.owner = owner # The default size of 0 will be applied for any type that stores data in the @@ -108,7 +109,7 @@ cdef class OwnerWithCAI: # Cast to Python integers before multiplying to avoid overflow. size = int(cv.size()) * int(cpp_size_of(cv.type())) elif cv.type().id() == type_id.STRING: - size = strings_column_view(cv).chars_size(stream.view()) + size = strings_column_view(cv).chars_size((stream).view().value()) obj.cai = { "shape": (size,), @@ -156,7 +157,7 @@ class ArrayInterfaceWrapper: self.__array_interface__ = iface -cdef gpumemoryview _copy_array_to_device(object buf, Stream stream=None): +cdef gpumemoryview _copy_array_to_device(object buf, object stream=None): """ Copy a host-side array.array buffer to device memory. 
@@ -175,11 +176,11 @@ cdef gpumemoryview _copy_array_to_device(object buf, Stream stream=None): cdef memoryview mv = memoryview(buf) cdef uintptr_t ptr = mv.obj.buffer_info()[0] cdef size_t nbytes = len(mv) * mv.itemsize - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) return gpumemoryview(DeviceBuffer.to_device( ptr, - stream + _stream )) @@ -401,7 +402,7 @@ cdef class Column: def from_arrow( obj: ArrowLike, dtype: DataType | None = None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ) -> ArrowLike: """ @@ -453,7 +454,8 @@ cdef class Column: cdef _ArrowColumnHolder result cdef unique_ptr[arrow_column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if hasattr(obj, "__arrow_c_device_array__"): @@ -469,7 +471,7 @@ cdef class Column: c_result = make_unique[arrow_column]( move(dereference(c_schema)), move(dereference(c_device_array)), - stream.view(), + _cs, result.mr.get_mr(), ) result.col.swap(c_result) @@ -477,7 +479,7 @@ cdef class Column: return Column.from_column_view_of_arbitrary( result.col.get().view(), result, - stream, + _stream, ) elif hasattr(obj, "__arrow_c_array__"): schema, h_array = obj.__arrow_c_array__() @@ -490,7 +492,7 @@ cdef class Column: c_result = make_unique[arrow_column]( move(dereference(c_schema)), move(dereference(c_array)), - stream.view(), + _cs, result.mr.get_mr(), ) result.col.swap(c_result) @@ -498,7 +500,7 @@ cdef class Column: return Column.from_column_view_of_arbitrary( result.col.get().view(), result, - stream, + _stream, ) elif hasattr(obj, "__arrow_c_stream__"): arrow_stream = obj.__arrow_c_stream__() @@ -514,7 +516,7 @@ cdef class Column: with nogil: c_result = make_unique[arrow_column]( move(dereference(c_arrow_stream)), - stream.view(), + _cs, result.mr.get_mr(), ) result.col.swap(c_result) @@ -522,7 +524,7 @@ cdef class Column: return 
Column.from_column_view_of_arbitrary( result.col.get().view(), result, - stream, + _stream, ) elif hasattr(obj, "__arrow_c_device_stream__"): # TODO: When we add support for this case, it should be moved above @@ -656,7 +658,7 @@ cdef class Column: @staticmethod cdef Column from_libcudf( unique_ptr[column] libcudf_col, - Stream stream, + object stream, DeviceMemoryResource mr ): """Create a Column from a libcudf column. @@ -667,6 +669,7 @@ cdef class Column: """ assert stream is not None, "stream cannot be None" assert mr is not None, "mr cannot be None" + cdef Stream _stream = stream cdef DataType dtype = DataType.from_libcudf(libcudf_col.get().type()) cdef size_type size = libcudf_col.get().size() @@ -677,13 +680,13 @@ cdef class Column: # Note that when converting to cudf Column objects we'll need to pull # out the base object. cdef gpumemoryview data = gpumemoryview( - DeviceBuffer.c_from_unique_ptr(move(contents.data), stream, mr) + DeviceBuffer.c_from_unique_ptr(move(contents.data), _stream, mr) ) cdef gpumemoryview mask = None if null_count > 0: mask = gpumemoryview( - DeviceBuffer.c_from_unique_ptr(move(contents.null_mask), stream, mr) + DeviceBuffer.c_from_unique_ptr(move(contents.null_mask), _stream, mr) ) children = [] @@ -772,7 +775,7 @@ cdef class Column: cdef Column from_column_view_of_arbitrary( const column_view& cv, object owner, - Stream stream, + object stream, ): """Create a Column from a libcudf column_view into an arbitrary owner. @@ -818,7 +821,7 @@ cdef class Column: def from_scalar( Scalar slr, size_type size, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a Column from a Scalar. 
@@ -839,18 +842,19 @@ cdef class Column: """ cdef const scalar* c_scalar = slr.get() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = make_column_from_scalar( dereference(c_scalar), size, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) - cpdef Scalar to_scalar(self, Stream stream=None, DeviceMemoryResource mr=None): + cpdef Scalar to_scalar(self, object stream=None, DeviceMemoryResource mr=None): """ Return the first value of 1-element column as a Scalar. @@ -873,11 +877,12 @@ cdef class Column: cdef column_view cv = self.view() cdef unique_ptr[scalar] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = get_element(cv, 0, stream.view(), mr.get_mr()) + result = get_element(cv, 0, _cs, mr.get_mr()) return Scalar.from_libcudf(move(result)) @@ -885,7 +890,7 @@ cdef class Column: def all_null_like( Column like, size_type size, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create an all null column from a template. @@ -904,18 +909,19 @@ cdef class Column: Column An all-null column of `size` rows and type matching `like`. 
""" - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) - cdef Scalar slr = Scalar.empty_like(like, stream, mr) + cdef Scalar slr = Scalar.empty_like(like, _stream, mr) cdef unique_ptr[column] c_result with nogil: c_result = make_column_from_scalar( dereference(slr.get()), size, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) @staticmethod cdef Column _wrap_nested_list_column( @@ -923,7 +929,7 @@ cdef class Column: tuple shape, DataType dtype, Column base=None, - Stream stream=None, + object stream=None, ): """ Construct a list Column from a gpumemoryview and array @@ -937,7 +943,7 @@ cdef class Column: """ ndim = len(shape) flat_size = functools.reduce(operator.mul, shape) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) if base is None: base = Column( @@ -958,9 +964,9 @@ cdef class Column: offsets_col = sequence( outer_len + 1, - Scalar.from_py(0, int32_dtype, stream=stream), - Scalar.from_py(shape[i], int32_dtype, stream=stream), - stream, + Scalar.from_py(0, int32_dtype, stream=_stream), + Scalar.from_py(shape[i], int32_dtype, stream=_stream), + _stream, ) nested = Column( @@ -976,7 +982,7 @@ cdef class Column: return nested @classmethod - def from_array_interface(cls, obj, Stream stream=None): + def from_array_interface(cls, obj, object stream=None): """ Create a Column from an object implementing the NumPy Array Interface. 
@@ -1016,21 +1022,21 @@ cdef class Column: cdef const unsigned char* ptr cdef const unsigned char[:] view - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) if nbytes > 0: ptr = data_ptr view = ( ptr)[:nbytes] - dbuf = DeviceBuffer.to_device(view, stream) + dbuf = DeviceBuffer.to_device(view, _stream) else: - dbuf = DeviceBuffer(size=0, stream=stream) + dbuf = DeviceBuffer(size=0, stream=_stream) return Column._wrap_nested_list_column( - gpumemoryview(dbuf), shape, dtype, None, stream + gpumemoryview(dbuf), shape, dtype, None, _stream ) @classmethod - def from_cuda_array_interface(cls, obj, Stream stream=None): + def from_cuda_array_interface(cls, obj, object stream=None): """ Create a Column from an object implementing the CUDA Array Interface. @@ -1069,7 +1075,7 @@ cdef class Column: ) @classmethod - def from_array(cls, obj, Stream stream=None): + def from_array(cls, obj, object stream=None): """ Create a Column from any object which supports the NumPy or CUDA array interface. @@ -1115,7 +1121,7 @@ cdef class Column: def from_iterable_of_py( obj: Iterable, dtype: DataType | None = None, - Stream stream=None + object stream=None ) -> Column: """ Create a Column from a Python iterable of scalar values or nested iterables. 
@@ -1364,14 +1370,15 @@ cdef class Column: """The children of the column.""" return self._children - cpdef Column copy(self, Stream stream=None, DeviceMemoryResource mr=None): + cpdef Column copy(self, object stream=None, DeviceMemoryResource mr=None): """Create a copy of the column.""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = make_unique[column](self.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = make_unique[column](self.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef uint64_t device_buffer_size(self): """ @@ -1419,10 +1426,12 @@ cdef class Column: return PyCapsule_New(raw_schema_ptr, 'arrow_schema', _release_schema) - def _to_host_array(self, Stream stream): + def _to_host_array(self, object stream): cdef ArrowArray* raw_host_array_ptr + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() with nogil: - raw_host_array_ptr = to_arrow_host_raw(self.view(), stream.view()) + raw_host_array_ptr = to_arrow_host_raw(self.view(), _cs) return PyCapsule_New(raw_host_array_ptr, "arrow_array", _release_array) @@ -1484,7 +1493,7 @@ cdef class ListsColumnView: """ return lists_column_view(self._column.view()) - cpdef Column get_sliced_child(self, Stream stream=None): + cpdef Column get_sliced_child(self, object stream=None): """ Get the list elements child properly sliced to match parent's view. 
@@ -1498,9 +1507,9 @@ cdef class ListsColumnView: Column The sliced elements column """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) - cdef column_view c_child = self.view().get_sliced_child(stream.view()) + cdef column_view c_child = self.view().get_sliced_child(_stream.view().value()) return Column.from_column_view(c_child, self._column.child(1)) @@ -1522,7 +1531,7 @@ cdef class StructsColumnView: """ return structs_column_view(self._column.view()) - cpdef Column get_sliced_child(self, int index, Stream stream=None): + cpdef Column get_sliced_child(self, int index, object stream=None): """ Get the struct elements child properly sliced to match parent's view. @@ -1538,9 +1547,10 @@ cdef class StructsColumnView: Column The sliced elements column """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) - cdef column_view c_child = self.view().get_sliced_child(index, stream.view()) + cdef cudaStream_t _cs = _stream.view().value() + cdef column_view c_child = self.view().get_sliced_child(index, _cs) return Column.from_column_view(c_child, self._column.child(index)) diff --git a/python/pylibcudf/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd index d26b3396e30..3f9841c045d 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/column_factories.pxd @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from pylibcudf.libcudf.types cimport mask_state from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .types cimport DataType, size_type, type_id @@ -20,7 +19,7 @@ cpdef Column make_numeric_column( DataType type_, size_type size, MaskArg mask, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -28,7 +27,7 @@ cpdef Column make_fixed_point_column( DataType type_, size_type size, MaskArg mask, - Stream stream = *, + object stream = *, 
DeviceMemoryResource mr = *, ) @@ -36,7 +35,7 @@ cpdef Column make_timestamp_column( DataType type_, size_type size, MaskArg mask, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -44,7 +43,7 @@ cpdef Column make_duration_column( DataType type_, size_type size, MaskArg mask, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -52,18 +51,18 @@ cpdef Column make_fixed_width_column( DataType type_, size_type size, MaskArg mask, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column make_empty_column( MakeEmptyColumnOperand type_or_id, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column make_empty_lists_column( DataType child_type, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/column_factories.pyi b/python/pylibcudf/pylibcudf/column_factories.pyi index 66d46d88949..a9e92c5f823 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyi +++ b/python/pylibcudf/pylibcudf/column_factories.pyi @@ -1,53 +1,53 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType, MaskState, TypeId +from pylibcudf.utils import CudaStreamLike def make_numeric_column( type_: DataType, size: int, mstate: MaskState, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def make_fixed_point_column( type_: DataType, size: int, mstate: MaskState, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def make_timestamp_column( type_: DataType, size: int, mstate: MaskState, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def make_duration_column( type_: DataType, size: int, mstate: MaskState, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def make_fixed_width_column( type_: DataType, size: int, mstate: MaskState, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def make_empty_column( type_or_id: DataType | TypeId, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def make_empty_lists_column( child_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index 0848f1aff03..45d590f4106 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -20,6 +20,7 @@ from .types cimport DataType, type_id from .types import MaskState, TypeId from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ @@ -34,7 +35,7 @@ __all__ = [ cpdef Column make_empty_column( MakeEmptyColumnOperand type_or_id, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Creates an empty column of the specified type. 
@@ -53,7 +54,7 @@ cpdef Column make_empty_column( """ cdef unique_ptr[column] result cdef type_id id - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) if MakeEmptyColumnOperand is object: @@ -75,14 +76,14 @@ cpdef Column make_empty_column( raise TypeError( "Must pass a TypeId or DataType" ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column make_numeric_column( DataType type_, size_type size, MaskArg mstate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Creates an empty numeric column. @@ -102,7 +103,8 @@ cpdef Column make_numeric_column( state = mstate else: raise TypeError("Invalid mask argument") - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -110,17 +112,17 @@ cpdef Column make_numeric_column( type_.c_obj, size, state, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column make_fixed_point_column( DataType type_, size_type size, MaskArg mstate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): @@ -136,7 +138,8 @@ cpdef Column make_fixed_point_column( state = mstate else: raise TypeError("Invalid mask argument") - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -144,18 +147,18 @@ cpdef Column make_fixed_point_column( type_.c_obj, size, state, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column make_timestamp_column( DataType type_, size_type size, MaskArg mstate, - Stream stream=None, + object stream=None, DeviceMemoryResource 
mr=None, ): @@ -171,7 +174,8 @@ cpdef Column make_timestamp_column( state = mstate else: raise TypeError("Invalid mask argument") - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -179,18 +183,18 @@ cpdef Column make_timestamp_column( type_.c_obj, size, state, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column make_duration_column( DataType type_, size_type size, MaskArg mstate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): @@ -206,7 +210,8 @@ cpdef Column make_duration_column( state = mstate else: raise TypeError("Invalid mask argument") - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -214,18 +219,18 @@ cpdef Column make_duration_column( type_.c_obj, size, state, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column make_fixed_width_column( DataType type_, size_type size, MaskArg mstate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): @@ -241,7 +246,8 @@ cpdef Column make_fixed_width_column( state = mstate else: raise TypeError("Invalid mask argument") - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -249,16 +255,16 @@ cpdef Column make_fixed_width_column( type_.c_obj, size, state, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column make_empty_lists_column( DataType child_type, - Stream stream=None, + object stream=None, 
DeviceMemoryResource mr=None, ): """Creates an empty column of the specified type. @@ -276,10 +282,10 @@ cpdef Column make_empty_lists_column( An empty Column """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) with nogil: result = cpp_make_empty_lists_column(child_type.c_obj) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/concatenate.pxd index 60adf27c9a3..60189ba4406 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/concatenate.pxd @@ -1,9 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from .table cimport Table -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource @@ -11,4 +10,4 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource # unify the column and table paths without using runtime dispatch instead. In this case # we choose to prioritize API consistency over performance, so we use the same function # with a bit of runtime dispatch overhead. -cpdef concatenate(list objects, Stream stream=*, DeviceMemoryResource mr=*) +cpdef concatenate(list objects, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/concatenate.pyi b/python/pylibcudf/pylibcudf/concatenate.pyi index 18e8bff2e2f..59379e01c46 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyi +++ b/python/pylibcudf/pylibcudf/concatenate.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def concatenate[ColumnOrTable: (Column, Table)]( objects: list[ColumnOrTable], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> ColumnOrTable: ... diff --git a/python/pylibcudf/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx index 36fa0984a68..9921d5b1a39 100644 --- a/python/pylibcudf/pylibcudf/concatenate.pyx +++ b/python/pylibcudf/pylibcudf/concatenate.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -16,10 +16,11 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["concatenate"] -cpdef concatenate(list objects, Stream stream=None, DeviceMemoryResource mr=None): +cpdef concatenate(list objects, object stream=None, DeviceMemoryResource mr=None): """Concatenate columns or tables. 
Parameters @@ -41,7 +42,8 @@ cpdef concatenate(list objects, Stream stream=None, DeviceMemoryResource mr=None cdef vector[column_view] c_columns cdef vector[table_view] c_tables - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) cdef unique_ptr[column] c_col_result @@ -53,17 +55,17 @@ cpdef concatenate(list objects, Stream stream=None, DeviceMemoryResource mr=None with nogil: c_tbl_result = cpp_concatenate.concatenate( - c_tables, stream.view(), mr.get_mr() + c_tables, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_tbl_result), stream, mr) + return Table.from_libcudf(move(c_tbl_result), _stream, mr) elif isinstance(objects[0], Column): for column in objects: c_columns.push_back((column).view()) with nogil: c_col_result = cpp_concatenate.concatenate( - c_columns, stream.view(), mr.get_mr() + c_columns, _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_col_result), stream, mr) + return Column.from_libcudf(move(c_col_result), _stream, mr) else: raise ValueError("input must be a list of Columns or Tables") diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/contiguous_split.pxd index a294e70a4a6..95259723dfa 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/contiguous_split.pxd @@ -32,13 +32,13 @@ cdef class HostBuffer: cdef class PackedColumns: cdef unique_ptr[packed_columns] c_obj - cdef Stream stream + cdef object stream cdef DeviceMemoryResource mr @staticmethod cdef PackedColumns from_libcudf( unique_ptr[packed_columns] data, - Stream stream, + object stream, DeviceMemoryResource mr ) cpdef tuple release(self) @@ -58,10 +58,10 @@ cdef class ChunkedPack: cpdef PackedColumns pack(Table input) -cpdef Table unpack(PackedColumns input, Stream stream=*) +cpdef Table unpack(PackedColumns input, object stream = *) cpdef Table unpack_from_memoryviews( memoryview metadata, object 
gpu_data, - Stream stream=*, + object stream = *, ) diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyi b/python/pylibcudf/pylibcudf/contiguous_split.pyi index df241c079ae..6e0e653b5bb 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyi +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyi @@ -2,28 +2,30 @@ # SPDX-License-Identifier: Apache-2.0 from rmm.mr import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.gpumemoryview import gpumemoryview from pylibcudf.span import Span from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike class PackedColumns: def __init__(self): ... def release( - self, stream: Stream | None = None + self, stream: CudaStreamLike | None = None ) -> tuple[memoryview[bytes], gpumemoryview]: ... def pack( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> PackedColumns: ... -def unpack(input: PackedColumns, stream: Stream | None = None) -> Table: ... +def unpack( + input: PackedColumns, stream: CudaStreamLike | None = None +) -> Table: ... def unpack_from_memoryviews( metadata: memoryview[bytes], gpu_data: Span, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> Table: ... class ChunkedPack: @@ -32,7 +34,7 @@ class ChunkedPack: def create( input: Table, user_buffer_size: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, temp_mr: DeviceMemoryResource | None = None, ) -> ChunkedPack: ... def has_next(self) -> bool: ... 
diff --git a/python/pylibcudf/pylibcudf/contiguous_split.pyx b/python/pylibcudf/pylibcudf/contiguous_split.pyx index 6b24def5dc8..239d89d6470 100644 --- a/python/pylibcudf/pylibcudf/contiguous_split.pyx +++ b/python/pylibcudf/pylibcudf/contiguous_split.pyx @@ -15,6 +15,8 @@ from cuda.bindings.cyruntime cimport ( cudaError_t, cudaMemcpyAsync, cudaMemcpyKind, + cudaStream_t, + cudaStreamSynchronize, ) from pylibcudf.libcudf.contiguous_split cimport ( @@ -27,7 +29,6 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.utilities.span cimport device_span -from rmm.librmm.cuda_stream_view cimport cuda_stream_view from rmm.pylibrmm.device_buffer cimport DeviceBuffer from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream @@ -36,6 +37,7 @@ from .gpumemoryview cimport gpumemoryview from .table cimport Table from .span import is_span from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ @@ -105,7 +107,7 @@ cdef class PackedColumns: @staticmethod cdef PackedColumns from_libcudf( unique_ptr[packed_columns] data, - Stream stream, + object stream, DeviceMemoryResource mr ): """Create a Python PackedColumns from a libcudf packed_columns.""" @@ -163,7 +165,7 @@ cdef class ChunkedPack: def create( Table input, size_t user_buffer_size, - Stream stream=None, + object stream=None, DeviceMemoryResource temp_mr=None, ): """ @@ -184,16 +186,16 @@ cdef class ChunkedPack: ------- New ChunkedPack object. 
""" - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) temp_mr = _get_memory_resource(temp_mr) cdef unique_ptr[chunked_pack] obj = chunked_pack.create( - input.view(), user_buffer_size, stream.view(), temp_mr.get_mr() + input.view(), user_buffer_size, _stream.view().value(), temp_mr.get_mr() ) cdef ChunkedPack out = ChunkedPack.__new__(ChunkedPack) out.table = input out.mr = temp_mr - out.stream = stream + out.stream = _stream out.c_obj = move(obj) return out @@ -292,7 +294,8 @@ cdef class ChunkedPack: dereference(self.c_obj).get_total_contiguous_size() ) ) - cdef cuda_stream_view stream = self.stream.view() + cdef Stream py_stream = self.stream + cdef cudaStream_t stream = py_stream.view().value() with nogil: while dereference(self.c_obj).has_next(): size = dereference(self.c_obj).next(d_span) @@ -301,22 +304,22 @@ cdef class ChunkedPack: d_span.data(), size, cudaMemcpyKind.cudaMemcpyDeviceToHost, - stream.value(), + stream, ) offset += size if err != cudaError.cudaSuccess: - stream.synchronize() + cudaStreamSynchronize(stream) raise RuntimeError( f"Memcpy in pack_to_host failed error: {err}" ) - stream.synchronize() + cudaStreamSynchronize(stream) return ( self.build_metadata(), memoryview(HostBuffer.from_unique_ptr(move(h_buf))), ) -cpdef PackedColumns pack(Table input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef PackedColumns pack(Table input, object stream=None, DeviceMemoryResource mr=None): """Deep-copy a table into a serialized contiguous memory format. Later use `unpack` or `unpack_from_memoryviews` to unpack the serialized @@ -346,16 +349,17 @@ cpdef PackedColumns pack(Table input, Stream stream=None, DeviceMemoryResource m For details, see :cpp:func:`pack`. 
""" cdef unique_ptr[packed_columns] pack - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: pack = move(make_unique[packed_columns]( - cpp_pack(input.view(), stream.view(), mr.get_mr()) + cpp_pack(input.view(), _cs, mr.get_mr()) )) - return PackedColumns.from_libcudf(move(pack), stream, mr) + return PackedColumns.from_libcudf(move(pack), _stream, mr) -cpdef Table unpack(PackedColumns input, Stream stream=None): +cpdef Table unpack(PackedColumns input, object stream=None): """Deserialize the result of `pack`. Copies the result of a serialized table into a table. @@ -375,16 +379,16 @@ cpdef Table unpack(PackedColumns input, Stream stream=None): Copy of the packed columns. """ cdef table_view v - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) with nogil: v = cpp_unpack(dereference(input.c_obj)) - return Table.from_table_view_of_arbitrary(v, input, stream) + return Table.from_table_view_of_arbitrary(v, input, _stream) cpdef Table unpack_from_memoryviews( memoryview metadata, object gpu_data, - Stream stream=None, + object stream=None, ): """Deserialize the result of `pack`. @@ -406,7 +410,7 @@ cpdef Table unpack_from_memoryviews( Table Copy of the packed columns. """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) cdef device_span[uint8_t] d_span = _get_device_span(gpu_data) if metadata.nbytes == 0: @@ -416,7 +420,7 @@ cpdef Table unpack_from_memoryviews( # used for any operations. 
return Table.from_libcudf( make_unique[table](table_view()), - stream, + _stream, _get_memory_resource(), ) @@ -428,4 +432,4 @@ cpdef Table unpack_from_memoryviews( cdef table_view v with nogil: v = cpp_unpack(metadata_ptr, gpu_data_ptr) - return Table.from_table_view_of_arbitrary(v, gpu_data, stream) + return Table.from_table_view_of_arbitrary(v, gpu_data, _stream) diff --git a/python/pylibcudf/pylibcudf/copying.pxd b/python/pylibcudf/pylibcudf/copying.pxd index caaa590de15..4143e846994 100644 --- a/python/pylibcudf/pylibcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/copying.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool as cbool @@ -9,7 +9,6 @@ from pylibcudf.libcudf.copying cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar @@ -40,7 +39,7 @@ cpdef Table gather( Table source_table, Column gather_map, out_of_bounds_policy bounds_policy, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -48,19 +47,19 @@ cpdef Table scatter( TableOrListOfScalars source, Column scatter_map, Table target_table, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef ColumnOrTable empty_like( - ColumnOrTable input, Stream stream=*, DeviceMemoryResource mr=* + ColumnOrTable input, object stream = *, DeviceMemoryResource mr=* ) cpdef Column allocate_like( Column input_column, mask_allocation_policy policy, size=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -70,7 +69,7 @@ cpdef Column copy_range_in_place( size_type input_begin, size_type input_end, size_type target_begin, - Stream stream=*, + object stream = *, ) cpdef Column copy_range( @@ -79,7 +78,7 @@ cpdef Column 
copy_range( size_type input_begin, size_type input_end, size_type target_begin, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -87,19 +86,19 @@ cpdef Column shift( Column input, size_type offset, Scalar fill_value, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) -cpdef list slice(ColumnOrTable input, list indices, Stream stream=*) +cpdef list slice(ColumnOrTable input, list indices, object stream = *) -cpdef list split(ColumnOrTable input, list splits, Stream stream=*) +cpdef list split(ColumnOrTable input, list splits, object stream = *) cpdef Column copy_if_else( LeftCopyIfElseOperand lhs, RightCopyIfElseOperand rhs, Column boolean_mask, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -107,13 +106,13 @@ cpdef Table boolean_mask_scatter( TableOrListOfScalars input, Table target, Column boolean_mask, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Scalar get_element( Column input_column, size_type index, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/copying.pyi b/python/pylibcudf/pylibcudf/copying.pyi index 04acecc2f1b..bdff6cddad5 100644 --- a/python/pylibcudf/pylibcudf/copying.pyi +++ b/python/pylibcudf/pylibcudf/copying.pyi @@ -1,15 +1,15 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from typing import TypeVar from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike class MaskAllocationPolicy(IntEnum): NEVER = ... 
@@ -26,26 +26,26 @@ def gather( source_table: Table, gather_map: Column, bounds_policy: OutOfBoundsPolicy, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def scatter( source: Table | list[Scalar], scatter_map: Column, target_table: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def empty_like( input: ColumnOrTable, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> ColumnOrTable: ... def allocate_like( input_column: Column, policy: MaskAllocationPolicy, size: int | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def copy_range_in_place( @@ -54,7 +54,7 @@ def copy_range_in_place( input_begin: int, input_end: int, target_begin: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> Column: ... def copy_range( input_column: Column, @@ -62,39 +62,43 @@ def copy_range( input_begin: int, input_end: int, target_begin: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def shift( input: Column, offset: int, fill_value: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def slice( - input: ColumnOrTable, indices: list[int], stream: Stream | None = None + input: ColumnOrTable, + indices: list[int], + stream: CudaStreamLike | None = None, ) -> list[ColumnOrTable]: ... def split( - input: ColumnOrTable, splits: list[int], stream: Stream | None = None + input: ColumnOrTable, + splits: list[int], + stream: CudaStreamLike | None = None, ) -> list[ColumnOrTable]: ... 
def copy_if_else( lhs: Column | Scalar, rhs: Column | Scalar, boolean_mask: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def boolean_mask_scatter( input: Table | list[Scalar], target: Table, boolean_mask: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def get_element( input_column: Column, index: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Scalar: ... diff --git a/python/pylibcudf/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx index f8f44e03938..30be1ea7d0a 100644 --- a/python/pylibcudf/pylibcudf/copying.pyx +++ b/python/pylibcudf/pylibcudf/copying.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator import dereference @@ -40,6 +40,7 @@ from .column cimport Column from .scalar cimport Scalar from .table cimport Table from .utils cimport _as_vector, _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ @@ -64,7 +65,7 @@ cpdef Table gather( Table source_table, Column gather_map, out_of_bounds_policy bounds_policy, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Select rows from source_table according to the provided gather_map. @@ -94,7 +95,8 @@ cpdef Table gather( If the gather_map contains nulls. 
""" cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -102,18 +104,18 @@ cpdef Table gather( source_table.view(), gather_map.view(), bounds_policy, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table scatter( TableOrListOfScalars source, Column scatter_map, Table target_table, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Scatter from source into target_table according to scatter_map. @@ -155,7 +157,8 @@ cpdef Table scatter( """ cdef unique_ptr[table] c_result cdef vector[reference_wrapper[const scalar]] source_scalars - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if TableOrListOfScalars is Table: @@ -164,7 +167,7 @@ cpdef Table scatter( source.view(), scatter_map.view(), target_table.view(), - stream.view(), + _cs, mr.get_mr() ) else: @@ -174,14 +177,14 @@ cpdef Table scatter( source_scalars, scatter_map.view(), target_table.view(), - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef ColumnOrTable empty_like( - ColumnOrTable input, Stream stream=None, DeviceMemoryResource mr=None + ColumnOrTable input, object stream=None, DeviceMemoryResource mr=None ): """Create an empty column or table with the same type as ``input``. 
@@ -201,23 +204,23 @@ cpdef ColumnOrTable empty_like( """ cdef unique_ptr[table] c_tbl_result cdef unique_ptr[column] c_col_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) if ColumnOrTable is Column: with nogil: c_col_result = cpp_copying.empty_like(input.view()) - return Column.from_libcudf(move(c_col_result), stream, mr) + return Column.from_libcudf(move(c_col_result), _stream, mr) else: with nogil: c_tbl_result = cpp_copying.empty_like(input.view()) - return Table.from_libcudf(move(c_tbl_result), stream, mr) + return Table.from_libcudf(move(c_tbl_result), _stream, mr) cpdef Column allocate_like( Column input_column, mask_allocation_policy policy, size=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Allocate a column with the same type as input_column. @@ -244,7 +247,8 @@ cpdef Column allocate_like( cdef unique_ptr[column] c_result cdef size_type c_size = size if size is not None else input_column.size() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -252,11 +256,11 @@ cpdef Column allocate_like( input_column.view(), c_size, policy, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column copy_range_in_place( @@ -265,7 +269,7 @@ cpdef Column copy_range_in_place( size_type input_begin, size_type input_end, size_type target_begin, - Stream stream=None + object stream=None ): """Copy a range of elements from input_column to target_column. 
@@ -301,7 +305,8 @@ cpdef Column copy_range_in_place( """ cdef mutable_column_view target_view = target_column.mutable_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() with nogil: cpp_copying.copy_range_in_place( @@ -310,7 +315,7 @@ cpdef Column copy_range_in_place( input_begin, input_end, target_begin, - stream.view() + _cs ) target_column.set_null_count(target_view.null_count()) @@ -321,7 +326,7 @@ cpdef Column copy_range( size_type input_begin, size_type input_end, size_type target_begin, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Copy a range of elements from input_column to target_column. @@ -357,7 +362,8 @@ cpdef Column copy_range( If target and source have different types. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -367,18 +373,18 @@ cpdef Column copy_range( input_begin, input_end, target_begin, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column shift( Column input, size_type offset, Scalar fill_value, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Shift the elements of input by offset. @@ -409,7 +415,8 @@ cpdef Column shift( of fixed width or string type. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -417,13 +424,13 @@ cpdef Column shift( input.view(), offset, dereference(fill_value.c_obj), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef list slice(ColumnOrTable input, list indices, Stream stream=None): +cpdef list slice(ColumnOrTable input, list indices, object stream=None): """Slice input according to indices. For details on the implementation, see :cpp:func:`slice`. @@ -454,11 +461,12 @@ cpdef list slice(ColumnOrTable input, list indices, Stream stream=None): cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() if ColumnOrTable is Column: with nogil: - c_col_result = cpp_copying.slice(input.view(), c_indices, stream.view()) + c_col_result = cpp_copying.slice(input.view(), c_indices, _cs) return [ Column.from_column_view(c_col_result[i], input) @@ -466,7 +474,7 @@ cpdef list slice(ColumnOrTable input, list indices, Stream stream=None): ] else: with nogil: - c_tbl_result = cpp_copying.slice(input.view(), c_indices, stream.view()) + c_tbl_result = cpp_copying.slice(input.view(), c_indices, _cs) return [ Table.from_table_view(c_tbl_result[i], input) @@ -474,7 +482,7 @@ cpdef list slice(ColumnOrTable input, list indices, Stream stream=None): ] -cpdef list split(ColumnOrTable input, list splits, Stream stream=None): +cpdef list split(ColumnOrTable input, list splits, object stream=None): """Split input into multiple. For details on the implementation, see :cpp:func:`split`. 
@@ -497,11 +505,12 @@ cpdef list split(ColumnOrTable input, list splits, Stream stream=None): cdef vector[column_view] c_col_result cdef vector[table_view] c_tbl_result cdef int i - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() if ColumnOrTable is Column: with nogil: - c_col_result = cpp_copying.split(input.view(), c_splits, stream.view()) + c_col_result = cpp_copying.split(input.view(), c_splits, _cs) return [ Column.from_column_view(c_col_result[i], input) @@ -509,7 +518,7 @@ cpdef list split(ColumnOrTable input, list splits, Stream stream=None): ] else: with nogil: - c_tbl_result = cpp_copying.split(input.view(), c_splits, stream.view()) + c_tbl_result = cpp_copying.split(input.view(), c_splits, _cs) return [ Table.from_table_view(c_tbl_result[i], input) @@ -521,7 +530,7 @@ cpdef Column copy_if_else( LeftCopyIfElseOperand lhs, RightCopyIfElseOperand rhs, Column boolean_mask, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Copy elements from lhs or rhs into a new column according to boolean_mask. @@ -556,7 +565,8 @@ cpdef Column copy_if_else( columns), or if lhs and rhs are not of the same length (if both are columns). 
""" cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Column: @@ -565,7 +575,7 @@ cpdef Column copy_if_else( lhs.view(), rhs.view(), boolean_mask.view(), - stream.view(), + _cs, mr.get_mr() ) elif LeftCopyIfElseOperand is Column and RightCopyIfElseOperand is Scalar: @@ -574,7 +584,7 @@ cpdef Column copy_if_else( lhs.view(), dereference(rhs.c_obj), boolean_mask.view(), - stream.view(), + _cs, mr.get_mr() ) elif LeftCopyIfElseOperand is Scalar and RightCopyIfElseOperand is Column: @@ -583,7 +593,7 @@ cpdef Column copy_if_else( dereference(lhs.c_obj), rhs.view(), boolean_mask.view(), - stream.view(), + _cs, mr.get_mr() ) else: @@ -592,18 +602,18 @@ cpdef Column copy_if_else( dereference(lhs.c_obj), dereference(rhs.c_obj), boolean_mask.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Table boolean_mask_scatter( TableOrListOfScalars input, Table target, Column boolean_mask, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Scatter rows from input into target according to boolean_mask. 
@@ -641,7 +651,8 @@ cpdef Table boolean_mask_scatter( """ cdef unique_ptr[table] result cdef vector[reference_wrapper[const scalar]] source_scalars - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if TableOrListOfScalars is Table: @@ -650,7 +661,7 @@ cpdef Table boolean_mask_scatter( input.view(), target.view(), boolean_mask.view(), - stream.view(), + _cs, mr.get_mr() ) else: @@ -660,17 +671,17 @@ cpdef Table boolean_mask_scatter( source_scalars, target.view(), boolean_mask.view(), - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(result), stream, mr) + return Table.from_libcudf(move(result), _stream, mr) cpdef Scalar get_element( Column input_column, size_type index, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Get the element at index from input_column. @@ -697,12 +708,13 @@ cpdef Scalar get_element( If index is out of bounds. """ cdef unique_ptr[scalar] c_output - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_output = cpp_copying.get_element( - input_column.view(), index, stream.view(), mr.get_mr() + input_column.view(), index, _cs, mr.get_mr() ) return Scalar.from_libcudf(move(c_output)) diff --git a/python/pylibcudf/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd index 1a93ee62c43..d7d15f0c19f 100644 --- a/python/pylibcudf/pylibcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/datetime.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.datetime cimport datetime_component, rounding_frequency from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrScalar: Column @@ -14,54 +13,54 @@ ctypedef fused ColumnOrScalar: cpdef Column extract_datetime_component( Column input, datetime_component component, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column ceil_datetimes( Column input, rounding_frequency freq, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column floor_datetimes( Column input, rounding_frequency freq, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column round_datetimes( Column input, rounding_frequency freq, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column add_calendrical_months( Column timestamps, ColumnOrScalar months, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column day_of_year( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) cpdef Column is_leap_year( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) cpdef Column last_day_of_month( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) cpdef Column extract_quarter( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) cpdef Column days_in_month( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) diff --git a/python/pylibcudf/pylibcudf/datetime.pyi b/python/pylibcudf/pylibcudf/datetime.pyi 
index abcc608daa4..e671d2d18cf 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyi +++ b/python/pylibcudf/pylibcudf/datetime.pyi @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike class DatetimeComponent(IntEnum): YEAR = ... @@ -33,55 +33,55 @@ class RoundingFrequency(IntEnum): def extract_datetime_component( input: Column, component: DatetimeComponent, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def ceil_datetimes( input: Column, freq: RoundingFrequency, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def floor_datetimes( input: Column, freq: RoundingFrequency, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def round_datetimes( input: Column, freq: RoundingFrequency, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def add_calendrical_months( input: Column, months: Column | Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def day_of_year( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_leap_year( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def last_day_of_month( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def extract_quarter( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def days_in_month( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx index 2a837c5b749..1e5270bad92 100644 --- a/python/pylibcudf/pylibcudf/datetime.pyx +++ b/python/pylibcudf/pylibcudf/datetime.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -30,6 +30,7 @@ from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "DatetimeComponent", @@ -49,7 +50,7 @@ __all__ = [ cpdef Column extract_datetime_component( Column input, datetime_component component, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -73,19 +74,20 @@ cpdef Column extract_datetime_component( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: result = cpp_extract_datetime_component( - input.view(), component, stream.view(), mr.get_mr() + input.view(), component, _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column ceil_datetimes( Column input, 
rounding_frequency freq, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -109,17 +111,18 @@ cpdef Column ceil_datetimes( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_ceil_datetimes(input.view(), freq, stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_ceil_datetimes(input.view(), freq, _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column floor_datetimes( Column input, rounding_frequency freq, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -143,17 +146,18 @@ cpdef Column floor_datetimes( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_floor_datetimes(input.view(), freq, stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_floor_datetimes(input.view(), freq, _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column round_datetimes( Column input, rounding_frequency freq, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -177,17 +181,18 @@ cpdef Column round_datetimes( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_round_datetimes(input.view(), freq, stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_round_datetimes(input.view(), freq, _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column add_calendrical_months( Column input, 
ColumnOrScalar months, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -216,7 +221,8 @@ cpdef Column add_calendrical_months( cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -224,13 +230,13 @@ cpdef Column add_calendrical_months( input.view(), months.view() if ColumnOrScalar is Column else dereference(months.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column day_of_year( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Computes the day number since the start of @@ -253,15 +259,16 @@ cpdef Column day_of_year( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_day_of_year(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_day_of_year(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column is_leap_year( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Check if the year of the given date is a leap year. 
@@ -283,15 +290,16 @@ cpdef Column is_leap_year( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_is_leap_year(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_is_leap_year(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column last_day_of_month( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Computes the last day of the month. @@ -313,15 +321,16 @@ cpdef Column last_day_of_month( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_last_day_of_month(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_last_day_of_month(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column extract_quarter( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Returns the quarter (ie. 
a value from {1, 2, 3, 4}) @@ -343,15 +352,16 @@ cpdef Column extract_quarter( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_extract_quarter(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_extract_quarter(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column days_in_month( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Extract the number of days in the month. @@ -372,12 +382,13 @@ cpdef Column days_in_month( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_days_in_month(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + result = cpp_days_in_month(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(result), _stream, mr) DatetimeComponent.__str__ = DatetimeComponent.__repr__ RoundingFrequency.__str__ = RoundingFrequency.__repr__ diff --git a/python/pylibcudf/pylibcudf/experimental/_join_streams.pxd b/python/pylibcudf/pylibcudf/experimental/_join_streams.pxd index db9ca865197..832d572b467 100644 --- a/python/pylibcudf/pylibcudf/experimental/_join_streams.pxd +++ b/python/pylibcudf/pylibcudf/experimental/_join_streams.pxd @@ -1,6 +1,5 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 -from rmm.pylibrmm.stream cimport Stream -cpdef void join_streams(list streams, Stream stream) +cpdef void join_streams(list streams, object stream) diff --git a/python/pylibcudf/pylibcudf/experimental/_join_streams.pyi b/python/pylibcudf/pylibcudf/experimental/_join_streams.pyi index 522239c6a80..c9c2ba79e36 100644 --- a/python/pylibcudf/pylibcudf/experimental/_join_streams.pyi +++ b/python/pylibcudf/pylibcudf/experimental/_join_streams.pyi @@ -1,6 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -from rmm.pylibrmm.stream import Stream +from pylibcudf.utils import CudaStreamLike -def join_streams(streams: list[Stream], stream: Stream) -> None: ... +def join_streams( + streams: list[CudaStreamLike], stream: CudaStreamLike +) -> None: ... diff --git a/python/pylibcudf/pylibcudf/experimental/_join_streams.pyx b/python/pylibcudf/pylibcudf/experimental/_join_streams.pyx index 7f3d2f228fb..d9efcb19ed9 100644 --- a/python/pylibcudf/pylibcudf/experimental/_join_streams.pyx +++ b/python/pylibcudf/pylibcudf/experimental/_join_streams.pyx @@ -1,21 +1,22 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 +from cuda.bindings.cyruntime cimport cudaStream_t from libcpp.vector cimport vector from pylibcudf.libcudf.detail.utilities cimport stream_pool as cpp_stream_pool +from pylibcudf.libcudf.detail.utilities.stream_pool cimport const_cudaStream_t from pylibcudf.libcudf.utilities.span cimport host_span -from rmm.librmm.cuda_stream_view cimport cuda_stream_view from rmm.pylibrmm.stream cimport Stream -ctypedef const cuda_stream_view const_cuda_stream_view +from ..utils cimport _get_stream __all__ = ["join_streams"] -cpdef void join_streams(list streams, Stream stream): +cpdef void join_streams(list streams, object stream): """Synchronize a stream to an event on a set of streams. This function synchronizes the joined stream with the waited-on streams @@ -42,15 +43,16 @@ cpdef void join_streams(list streams, Stream stream): >>> plc.experimental.join_streams([stream1, stream2], join_stream) >>> # ... continue work on join_stream ... """ - cdef Stream c_stream = stream - cdef vector[cuda_stream_view] c_streams + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() + cdef vector[cudaStream_t] c_streams c_streams.reserve(len(streams)) for s in streams: - c_streams.push_back((s).view()) + c_streams.push_back((_get_stream(s)).view().value()) with nogil: cpp_stream_pool.join_streams( - host_span[const_cuda_stream_view](c_streams.data(), c_streams.size()), - c_stream.view() + host_span[const_cudaStream_t](c_streams.data(), c_streams.size()), + _cs ) diff --git a/python/pylibcudf/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd index b90d567b2c2..acb92e0212a 100644 --- a/python/pylibcudf/pylibcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/filling.pxd @@ -1,7 +1,6 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.libcudf.types cimport size_type -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column @@ -17,7 +16,7 @@ cpdef Column fill( size_type begin, size_type end, Scalar value, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -26,21 +25,21 @@ cpdef void fill_in_place( size_type c_begin, size_type c_end, Scalar value, - Stream stream = *, + object stream = *, ) cpdef Column sequence( size_type size, Scalar init, Scalar step, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Table repeat( Table input_table, ColumnOrSize count, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -48,6 +47,6 @@ cpdef Column calendrical_month_sequence( size_type n, Scalar init, size_type months, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/filling.pyi b/python/pylibcudf/pylibcudf/filling.pyi index a1023f8016c..2789ecd5aca 100644 --- a/python/pylibcudf/pylibcudf/filling.pyi +++ b/python/pylibcudf/pylibcudf/filling.pyi @@ -1,32 +1,33 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -from rmm.pylibrmm.stream import Stream - from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def fill( destination: Column, begin: int, end: int, value: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> Column: ... def fill_in_place( destination: Column, begin: int, end: int, value: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> None: ... 
def sequence( - size: int, init: Scalar, step: Scalar, stream: Stream | None = None + size: int, init: Scalar, step: Scalar, stream: CudaStreamLike | None = None ) -> Column: ... def repeat( - input_table: Table, count: Column | int, stream: Stream | None = None + input_table: Table, + count: Column | int, + stream: CudaStreamLike | None = None, ) -> Table: ... def calendrical_month_sequence( - n: int, init: Scalar, months: int, stream: Stream | None = None + n: int, init: Scalar, months: int, stream: CudaStreamLike | None = None ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx index 68e4862dfb8..ce6002eb24e 100644 --- a/python/pylibcudf/pylibcudf/filling.pyx +++ b/python/pylibcudf/pylibcudf/filling.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -22,6 +22,7 @@ from .column cimport Column from .scalar cimport Scalar from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ @@ -37,7 +38,7 @@ cpdef Column fill( size_type begin, size_type end, Scalar value, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): @@ -68,7 +69,8 @@ cpdef Column fill( cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -77,17 +79,17 @@ cpdef Column fill( begin, end, dereference(( value).c_obj), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef void fill_in_place( Column destination, size_type begin, size_type end, Scalar value, - Stream stream=None, + object stream=None, ): """Fill 
destination column in place from begin to end with value. @@ -112,7 +114,8 @@ cpdef void fill_in_place( None """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() cdef mutable_column_view c_destination = destination.mutable_view() with nogil: @@ -121,7 +124,7 @@ cpdef void fill_in_place( begin, end, dereference(value.c_obj), - stream.view() + _cs ) destination.set_null_count(c_destination.null_count()) @@ -129,7 +132,7 @@ cpdef Column sequence( size_type size, Scalar init, Scalar step, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a sequence column of size ``size`` with initial value ``init`` and step @@ -157,7 +160,8 @@ cpdef Column sequence( cdef unique_ptr[column] result cdef size_type c_size = size - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -165,16 +169,16 @@ cpdef Column sequence( c_size, dereference(init.c_obj), dereference(step.c_obj), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Table repeat( Table input_table, ColumnOrSize count, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Repeat rows of a Table. 
@@ -203,7 +207,8 @@ cpdef Table repeat( cdef unique_ptr[table] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if ColumnOrSize is Column: @@ -211,7 +216,7 @@ cpdef Table repeat( result = cpp_repeat( input_table.view(), count.view(), - stream.view(), + _cs, mr.get_mr() ) if ColumnOrSize is size_type: @@ -219,17 +224,17 @@ cpdef Table repeat( result = cpp_repeat( input_table.view(), count, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(result), stream, mr) + return Table.from_libcudf(move(result), _stream, mr) cpdef Column calendrical_month_sequence( size_type n, Scalar init, size_type months, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): @@ -256,7 +261,8 @@ cpdef Column calendrical_month_sequence( cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -264,7 +270,7 @@ cpdef Column calendrical_month_sequence( n, dereference(init.c_obj), months, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/groupby.pxd b/python/pylibcudf/pylibcudf/groupby.pxd index b5654ff6df8..a46146a145a 100644 --- a/python/pylibcudf/pylibcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/groupby.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -19,7 +19,6 @@ from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport null_order, order from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -46,31 +45,31 @@ cdef class GroupBy: cdef unique_ptr[vector[null_order]] _null_precedence cpdef tuple aggregate( - self, list requests, Stream stream=*, DeviceMemoryResource mr=* + self, list requests, object stream = *, DeviceMemoryResource mr=* ) - cpdef tuple scan(self, list requests, Stream stream=*, DeviceMemoryResource mr=*) + cpdef tuple scan(self, list requests, object stream = *, DeviceMemoryResource mr=*) cpdef tuple shift( self, Table values, list offset, list fill_values, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef tuple replace_nulls( self, Table values, list replace_policies, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef tuple get_groups( - self, Table values=*, Stream stream=*, DeviceMemoryResource mr=* + self, Table values=*, object stream = *, DeviceMemoryResource mr=* ) @staticmethod cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res, - Stream stream, + object stream, DeviceMemoryResource mr, ) diff --git a/python/pylibcudf/pylibcudf/groupby.pyi b/python/pylibcudf/pylibcudf/groupby.pyi index 75322706187..01c732175f4 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyi +++ b/python/pylibcudf/pylibcudf/groupby.pyi @@ -1,8 +1,7 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.aggregation import Aggregation from pylibcudf.column import Column @@ -10,6 +9,7 @@ from pylibcudf.replace import ReplacePolicy from pylibcudf.scalar import Scalar from pylibcudf.table import Table from pylibcudf.types import NullOrder, NullPolicy, Order, Sorted +from pylibcudf.utils import CudaStreamLike class GroupByRequest: def __init__( @@ -28,13 +28,13 @@ class GroupBy: def aggregate( self, requests: list[GroupByRequest], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, list[Table]]: ... def scan( self, requests: list[GroupByRequest], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, list[Table]]: ... def shift( @@ -42,19 +42,19 @@ class GroupBy: values: Table, offset: list[int], fill_values: list[Scalar], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, Table]: ... def replace_nulls( self, value: Table, replace_policies: list[ReplacePolicy], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, Table]: ... def get_groups( self, values: Table | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[list[int], Table, Table]: ... diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index 94a292996a0..4b2f842a360 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -28,6 +28,7 @@ from .column cimport Column from .table cimport Table from .types cimport null_order, null_policy, order, sorted from .utils cimport _as_vector, _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["GroupBy", "GroupByRequest"] @@ -141,12 +142,13 @@ cdef class GroupBy: @staticmethod cdef tuple _parse_outputs( pair[unique_ptr[table], vector[aggregation_result]] c_res, - Stream stream, + object stream, DeviceMemoryResource mr, ): # Convert libcudf aggregation/scan outputs into pylibcudf objects. # This function is for internal use only. - cdef Table group_keys = Table.from_libcudf(move(c_res.first), stream, mr) + cdef Stream _stream = stream + cdef Table group_keys = Table.from_libcudf(move(c_res.first), _stream, mr) cdef int i, j cdef list results = [] @@ -155,13 +157,13 @@ cdef class GroupBy: inner_results = [] for j in range(c_res.second[i].results.size()): inner_results.append( - Column.from_libcudf(move(c_res.second[i].results[j]), stream, mr) + Column.from_libcudf(move(c_res.second[i].results[j]), _stream, mr) ) results.append(Table(inner_results)) return group_keys, results cpdef tuple aggregate( - self, list requests, Stream stream=None, DeviceMemoryResource mr=None + self, list requests, object stream=None, DeviceMemoryResource mr=None ): """Compute aggregations on columns. @@ -189,19 +191,20 @@ cdef class GroupBy: c_requests.push_back(move(request._to_libcudf_agg_request())) cdef pair[unique_ptr[table], vector[aggregation_result]] c_res - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) # TODO: Need to capture C++ exceptions indicating that an invalid type was used. # We rely on libcudf to tell us this rather than checking the types beforehand # ourselves. 
with nogil: c_res = dereference(self.c_obj).aggregate( - c_requests, stream.view(), mr.get_mr() + c_requests, _cs, mr.get_mr() ) - return GroupBy._parse_outputs(move(c_res), stream, mr) + return GroupBy._parse_outputs(move(c_res), _stream, mr) cpdef tuple scan( - self, list requests, Stream stream=None, DeviceMemoryResource mr=None + self, list requests, object stream=None, DeviceMemoryResource mr=None ): """Compute scans on columns. @@ -229,18 +232,23 @@ cdef class GroupBy: c_requests.push_back(move(request._to_libcudf_scan_request())) cdef pair[unique_ptr[table], vector[aggregation_result]] c_res - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_res = dereference(self.c_obj).scan(c_requests, stream.view(), mr.get_mr()) - return GroupBy._parse_outputs(move(c_res), stream, mr) + c_res = dereference(self.c_obj).scan( + c_requests, + _cs, + mr.get_mr(), + ) + return GroupBy._parse_outputs(move(c_res), _stream, mr) cpdef tuple shift( self, Table values, list offset, list fill_values, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Compute shifts on columns. 
@@ -269,26 +277,27 @@ cdef class GroupBy: cdef vector[size_type] c_offset = offset cdef pair[unique_ptr[table], unique_ptr[table]] c_res - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_res = dereference(self.c_obj).shift( values.view(), c_offset, c_fill_values, - stream.view(), + _cs, mr.get_mr() ) return ( - Table.from_libcudf(move(c_res.first), stream, mr), - Table.from_libcudf(move(c_res.second), stream, mr), + Table.from_libcudf(move(c_res.first), _stream, mr), + Table.from_libcudf(move(c_res.second), _stream, mr), ) cpdef tuple replace_nulls( self, Table value, list replace_policies, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Replace nulls in columns. @@ -312,22 +321,23 @@ cdef class GroupBy: """ cdef pair[unique_ptr[table], unique_ptr[table]] c_res cdef vector[replace_policy] c_replace_policies = replace_policies - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_res = dereference(self.c_obj).replace_nulls( value.view(), c_replace_policies, - stream.view(), + _cs, mr.get_mr() ) return ( - Table.from_libcudf(move(c_res.first), stream, mr), - Table.from_libcudf(move(c_res.second), stream, mr), + Table.from_libcudf(move(c_res.first), _stream, mr), + Table.from_libcudf(move(c_res.second), _stream, mr), ) cpdef tuple get_groups( - self, Table values=None, Stream stream=None, DeviceMemoryResource mr=None + self, Table values=None, object stream=None, DeviceMemoryResource mr=None ): """Get the grouped keys and values labels for each row. 
@@ -352,24 +362,24 @@ cdef class GroupBy: cdef groups c_groups cdef table_view empty_view - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) if values: c_groups = dereference(self.c_obj).get_groups( - values.view(), stream.view(), mr.get_mr() + values.view(), _stream.view().value(), mr.get_mr() ) return ( c_groups.offsets, - Table.from_libcudf(move(c_groups.keys), stream, mr), - Table.from_libcudf(move(c_groups.values), stream, mr), + Table.from_libcudf(move(c_groups.keys), _stream, mr), + Table.from_libcudf(move(c_groups.values), _stream, mr), ) else: # c_groups.values is nullptr - call get_groups with empty table view c_groups = dereference(self.c_obj).get_groups( - empty_view, stream.view(), mr.get_mr() + empty_view, _stream.view().value(), mr.get_mr() ) return ( c_groups.offsets, - Table.from_libcudf(move(c_groups.keys), stream, mr), + Table.from_libcudf(move(c_groups.keys), _stream, mr), None, ) diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index 4febd6e4949..b824f2dbcb8 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,9 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint32_t, uint64_t from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -12,34 +11,34 @@ from .table cimport Table cpdef Column murmurhash3_x86_32( Table input, uint32_t seed=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Table murmurhash3_x64_128( Table input, uint64_t seed=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Column xxhash_32( Table input, uint32_t seed=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Column xxhash_64( Table input, uint64_t seed=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) -cpdef Column md5(Table input, Stream stream=*, DeviceMemoryResource mr=*) -cpdef Column sha1(Table input, Stream stream=*, DeviceMemoryResource mr=*) -cpdef Column sha224(Table input, Stream stream=*, DeviceMemoryResource mr=*) -cpdef Column sha256(Table input, Stream stream=*, DeviceMemoryResource mr=*) -cpdef Column sha384(Table input, Stream stream=*, DeviceMemoryResource mr=*) -cpdef Column sha512(Table input, Stream stream=*, DeviceMemoryResource mr=*) +cpdef Column md5(Table input, object stream = *, DeviceMemoryResource mr=*) +cpdef Column sha1(Table input, object stream = *, DeviceMemoryResource mr=*) +cpdef Column sha224(Table input, object stream = *, DeviceMemoryResource mr=*) +cpdef Column sha256(Table input, object stream = *, DeviceMemoryResource mr=*) +cpdef Column sha384(Table input, object stream = *, DeviceMemoryResource mr=*) +cpdef Column sha512(Table input, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index 1b8d055368a..dae03796b9c 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -1,67 +1,67 @@ -# SPDX-FileCopyrightText: 
Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from typing import Final from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike LIBCUDF_DEFAULT_HASH_SEED: Final[int] def murmurhash3_x86_32( input: Table, seed: int = ..., - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def murmurhash3_x64_128( input: Table, seed: int = ..., - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def xxhash_32( input: Table, seed: int = ..., - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def xxhash_64( input: Table, seed: int = ..., - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def md5( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def sha1( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def sha224( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def sha256( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def sha384( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def sha512( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index d9db52720bf..941393cf949 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr @@ -24,6 +24,7 @@ from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "LIBCUDF_DEFAULT_HASH_SEED", @@ -44,7 +45,7 @@ LIBCUDF_DEFAULT_HASH_SEED = DEFAULT_HASH_SEED cpdef Column murmurhash3_x86_32( Table input, uint32_t seed=DEFAULT_HASH_SEED, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the MurmurHash3 32-bit hash value of each row in the given table. @@ -65,24 +66,25 @@ cpdef Column murmurhash3_x86_32( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_murmurhash3_x86_32( input.view(), seed, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Table murmurhash3_x64_128( Table input, uint64_t seed=DEFAULT_HASH_SEED, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the MurmurHash3 64-bit hash value of each row in the given table. 
@@ -103,24 +105,25 @@ cpdef Table murmurhash3_x64_128( """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_murmurhash3_x64_128( input.view(), seed, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Column xxhash_32( Table input, uint32_t seed=DEFAULT_HASH_SEED, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the xxHash 32-bit hash value of each row in the given table. @@ -142,24 +145,25 @@ cpdef Column xxhash_32( cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_xxhash_32( input.view(), seed, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the xxHash 64-bit hash value of each row in the given table. @@ -181,23 +185,24 @@ cpdef Column xxhash_64( cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_xxhash_64( input.view(), seed, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column md5( Table input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the MD5 hash value of each row in the given table. 
@@ -220,16 +225,17 @@ cpdef Column md5( cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_md5(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = cpp_md5(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column sha1( Table input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the SHA-1 hash value of each row in the given table. @@ -250,17 +256,18 @@ cpdef Column sha1( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_sha1(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = cpp_sha1(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column sha224( Table input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the SHA-224 hash value of each row in the given table. @@ -281,17 +288,18 @@ cpdef Column sha224( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_sha224(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = cpp_sha224(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column sha256( Table input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the SHA-256 hash value of each row in the given table. 
@@ -312,17 +320,18 @@ cpdef Column sha256( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_sha256(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = cpp_sha256(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column sha384( Table input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the SHA-384 hash value of each row in the given table. @@ -343,17 +352,18 @@ cpdef Column sha384( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_sha384(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = cpp_sha384(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column sha512( Table input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the SHA-512 hash value of each row in the given table. 
@@ -374,9 +384,10 @@ cpdef Column sha512( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_sha512(input.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = cpp_sha512(input.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/interop.pxd b/python/pylibcudf/pylibcudf/interop.pxd index dfa62233541..942b9e806bc 100644 --- a/python/pylibcudf/pylibcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/interop.pxd @@ -1,12 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.table cimport Table -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cpdef Table from_dlpack( - object managed_tensor, Stream stream=*, DeviceMemoryResource mr=* + object managed_tensor, object stream = *, DeviceMemoryResource mr=* ) -cpdef object to_dlpack(Table input, Stream stream=*, DeviceMemoryResource mr=*) +cpdef object to_dlpack(Table input, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/interop.pyi b/python/pylibcudf/pylibcudf/interop.pyi index 0c10d71ec4f..34fe9394f7d 100644 --- a/python/pylibcudf/pylibcudf/interop.pyi +++ b/python/pylibcudf/pylibcudf/interop.pyi @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable, Mapping @@ -8,12 +8,12 @@ from typing import Any, overload import pyarrow as pa from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike @dataclass class ColumnMetadata: @@ -33,14 +33,14 @@ def from_arrow( obj: pa.Array[Any], *, data_type: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... @overload def from_arrow( obj: pa.Table, *, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... @overload @@ -67,11 +67,11 @@ def to_arrow( ) -> pa.Scalar[Any]: ... def from_dlpack( managed_tensor: Any, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def to_dlpack( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Any: ... 
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx index ffc14415470..23c47bb090f 100644 --- a/python/pylibcudf/pylibcudf/interop.pyx +++ b/python/pylibcudf/pylibcudf/interop.pyx @@ -23,6 +23,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .table cimport Table from .utils cimport _get_stream, _get_memory_resource from ._interop_helpers import ColumnMetadata +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ @@ -35,7 +36,7 @@ __all__ = [ cpdef Table from_dlpack( - object managed_tensor, Stream stream=None, DeviceMemoryResource mr=None + object managed_tensor, object stream=None, DeviceMemoryResource mr=None ): """ Convert a DLPack DLTensor into a cudf table. @@ -65,7 +66,8 @@ cpdef Table from_dlpack( if dlpack_tensor is NULL: raise ValueError("PyCapsule object contained a NULL pointer") PyCapsule_SetName(managed_tensor, "used_dltensor") - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) # Note: A copy is always performed when converting the dlpack @@ -74,14 +76,14 @@ cpdef Table from_dlpack( # TODO: https://github.com/rapidsai/cudf/issues/10874 # TODO: https://github.com/rapidsai/cudf/issues/10849 with nogil: - c_result = cpp_from_dlpack(dlpack_tensor, stream.view(), mr.get_mr()) + c_result = cpp_from_dlpack(dlpack_tensor, _cs, mr.get_mr()) - cdef Table result = Table.from_libcudf(move(c_result), stream, mr) + cdef Table result = Table.from_libcudf(move(c_result), _stream, mr) dlpack_tensor.deleter(dlpack_tensor) return result -cpdef object to_dlpack(Table input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef object to_dlpack(Table input, object stream=None, DeviceMemoryResource mr=None): """ Convert a cudf table into a DLPack DLTensor. 
@@ -109,11 +111,12 @@ cpdef object to_dlpack(Table input, Stream stream=None, DeviceMemoryResource mr= "Input is required to have null count as zero." ) cdef DLManagedTensor *dlpack_tensor - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - dlpack_tensor = cpp_to_dlpack(input.view(), stream.view(), mr.get_mr()) + dlpack_tensor = cpp_to_dlpack(input.view(), _cs, mr.get_mr()) return PyCapsule_New( dlpack_tensor, diff --git a/python/pylibcudf/pylibcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/io/avro.pxd index d76f2c1e628..0e8cb7ee283 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pxd +++ b/python/pylibcudf/pylibcudf/io/avro.pxd @@ -1,6 +1,5 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.io.types cimport SourceInfo, TableWithMetadata @@ -29,5 +28,5 @@ cdef class AvroReaderOptionsBuilder: cpdef AvroReaderOptions build(self) cpdef TableWithMetadata read_avro( - AvroReaderOptions options, Stream stream = *, DeviceMemoryResource mr=* + AvroReaderOptions options, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/io/avro.pyi b/python/pylibcudf/pylibcudf/io/avro.pyi index d7b6c87d388..7e41c39a2be 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyi +++ b/python/pylibcudf/pylibcudf/io/avro.pyi @@ -1,9 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.io.types import SourceInfo, TableWithMetadata +from pylibcudf.utils import CudaStreamLike __all__ = ["AvroReaderOptions", "AvroReaderOptionsBuilder", "read_avro"] @@ -21,6 +21,6 @@ class AvroReaderOptionsBuilder: def read_avro( options: AvroReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 9c5e2c05b11..f2bd021cdde 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.string cimport string @@ -6,6 +6,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.io.types cimport SourceInfo, TableWithMetadata @@ -152,7 +153,7 @@ cdef class AvroReaderOptionsBuilder: cpdef TableWithMetadata read_avro( AvroReaderOptions options, - Stream stream = None, + object stream = None, DeviceMemoryResource mr=None, ): """ @@ -173,8 +174,9 @@ cpdef TableWithMetadata read_avro( Device memory resource used to allocate the returned table's device memory. 
""" cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = move(cpp_read_avro(options.c_obj, s.view(), mr.get_mr())) + c_result = move(cpp_read_avro(options.c_obj, _cs, mr.get_mr())) return TableWithMetadata.from_libcudf(c_result, s, mr) diff --git a/python/pylibcudf/pylibcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/io/csv.pxd index 2f138e3aaa1..4293452311d 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/io/csv.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from libcpp.string cimport string from libcpp.vector cimport vector -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata @@ -74,7 +73,7 @@ cdef class CsvReaderOptionsBuilder: cpdef CsvReaderOptions build(self) cpdef TableWithMetadata read_csv( - CsvReaderOptions options, Stream stream = *, DeviceMemoryResource mr=* + CsvReaderOptions options, object stream = *, DeviceMemoryResource mr=* ) cdef class CsvWriterOptions: @@ -98,6 +97,6 @@ cdef class CsvWriterOptionsBuilder: cpdef CsvWriterOptions build(self) -cpdef void write_csv(CsvWriterOptions options, Stream stream = *) +cpdef void write_csv(CsvWriterOptions options, object stream = *) cpdef bool is_supported_write_csv(DataType type) diff --git a/python/pylibcudf/pylibcudf/io/csv.pyi b/python/pylibcudf/pylibcudf/io/csv.pyi index ade964da509..41465b3ba43 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyi +++ b/python/pylibcudf/pylibcudf/io/csv.pyi @@ -4,7 +4,6 @@ from typing import Self from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.io.types import ( CompressionType, @@ -15,6 +14,7 
@@ from pylibcudf.io.types import ( ) from pylibcudf.table import Table from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike class CsvReaderOptions: def __init__(self): ... @@ -61,10 +61,12 @@ class CsvReaderOptionsBuilder: def read_csv( options: CsvReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... -def write_csv(options: CsvWriterOptions, stream: Stream | None = None): ... +def write_csv( + options: CsvWriterOptions, stream: CudaStreamLike | None = None +): ... class CsvWriterOptions: def __init__(self): ... diff --git a/python/pylibcudf/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx index 749cd45fcb5..1c3ae9cb0bf 100644 --- a/python/pylibcudf/pylibcudf/io/csv.pyx +++ b/python/pylibcudf/pylibcudf/io/csv.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -8,6 +8,7 @@ from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.io.types cimport SourceInfo, SinkInfo, TableWithMetadata @@ -672,7 +673,7 @@ cdef class CsvReaderOptionsBuilder: cpdef TableWithMetadata read_csv( CsvReaderOptions options, - Stream stream = None, + object stream = None, DeviceMemoryResource mr=None, ): """ @@ -694,9 +695,10 @@ cpdef TableWithMetadata read_csv( """ cdef table_with_metadata c_result cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = move(cpp_read_csv(options.c_obj, s.view(), mr.get_mr())) + c_result = move(cpp_read_csv(options.c_obj, _cs, mr.get_mr())) cdef TableWithMetadata tbl_meta = TableWithMetadata.from_libcudf(c_result, s, mr) return tbl_meta @@ -882,7 +884,7 @@ cdef class CsvWriterOptionsBuilder: cpdef void write_csv( CsvWriterOptions options, - Stream stream = None, + object stream = None, ): """ Write to CSV format. 
@@ -900,8 +902,9 @@ cpdef void write_csv( CUDA stream used for device memory operations and kernel launches """ cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() with nogil: - cpp_write_csv(move(options.c_obj), s.view()) + cpp_write_csv(move(options.c_obj), _cs) cpdef bool is_supported_write_csv(DataType type): diff --git a/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pxd b/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pxd index 298b36651c3..8c471831823 100644 --- a/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pxd +++ b/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pxd @@ -32,5 +32,5 @@ cdef class FileMetaData: cdef class HybridScanReader: cdef unique_ptr[cpp_hybrid_scan_reader] c_obj - cdef Stream stream + cdef Stream _stream cdef DeviceMemoryResource mr diff --git a/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyi b/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyi index 0f0429a66db..6f1fbc250d8 100644 --- a/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyi +++ b/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyi @@ -4,13 +4,13 @@ from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.io.parquet import ParquetReaderOptions from pylibcudf.io.text import ByteRangeInfo from pylibcudf.io.types import TableWithMetadata from pylibcudf.span import Span +from pylibcudf.utils import CudaStreamLike class UseDataPageMask(IntEnum): YES: int @@ -44,7 +44,7 @@ class HybridScanReader: self, row_group_indices: list[int], options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> list[int]: ... 
def secondary_filters_byte_ranges( self, row_group_indices: list[int], options: ParquetReaderOptions @@ -54,20 +54,20 @@ class HybridScanReader: dictionary_page_data: list[Span], row_group_indices: list[int], options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> list[int]: ... def filter_row_groups_with_bloom_filters( self, bloom_filter_data: list[Span], row_group_indices: list[int], options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> list[int]: ... def build_row_mask_with_page_index_stats( self, row_group_indices: list[int], options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def filter_column_chunks_byte_ranges( @@ -80,7 +80,7 @@ class HybridScanReader: row_mask: Column, mask_data_pages: UseDataPageMask, options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... def payload_column_chunks_byte_ranges( @@ -93,7 +93,7 @@ class HybridScanReader: row_mask: Column, mask_data_pages: UseDataPageMask, options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... def all_column_chunks_byte_ranges( @@ -104,7 +104,7 @@ class HybridScanReader: row_group_indices: list[int], column_chunk_data: list[Span], options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... 
def setup_chunking_for_filter_columns( @@ -116,7 +116,7 @@ class HybridScanReader: mask_data_pages: UseDataPageMask, column_chunk_data: list[Span], options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> None: ... def materialize_filter_columns_chunk( @@ -132,7 +132,7 @@ class HybridScanReader: mask_data_pages: UseDataPageMask, column_chunk_data: list[Span], options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> None: ... def materialize_payload_columns_chunk( diff --git a/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyx b/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyx index beb28f6a1b0..4d25a05d362 100644 --- a/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyx +++ b/python/pylibcudf/pylibcudf/io/experimental/hybrid_scan.pyx @@ -225,7 +225,7 @@ cdef class HybridScanReader: self, list row_group_indices, ParquetReaderOptions options, - Stream stream=None + object stream=None ): """Filter row groups using column chunk statistics. @@ -243,7 +243,7 @@ cdef class HybridScanReader: list[int] Filtered row group indices """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) cdef vector[size_type] indices_vec = row_group_indices cdef vector[size_type] filtered = ( self.c_obj.get()[0].filter_row_groups_with_stats( @@ -251,7 +251,7 @@ cdef class HybridScanReader: indices_vec.data(), indices_vec.size() ), options.c_obj, - stream.view() + _stream.view().value() ) ) return list(filtered) @@ -295,7 +295,7 @@ cdef class HybridScanReader: list dictionary_page_data, list row_group_indices, ParquetReaderOptions options, - Stream stream=None + object stream=None ): """Filter row groups using column chunk dictionary pages. 
@@ -316,7 +316,7 @@ cdef class HybridScanReader: Filtered row group indices """ cdef vector[device_span[const_uint8_t]] spans_vec - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) for span in dictionary_page_data: spans_vec.push_back(_get_device_span(span)) @@ -329,7 +329,7 @@ cdef class HybridScanReader: ), host_span[const_size_type](indices_vec.data(), indices_vec.size()), options.c_obj, - stream.view() + _stream.view().value() ) return list(filtered) @@ -338,7 +338,7 @@ cdef class HybridScanReader: list bloom_filter_data, list row_group_indices, ParquetReaderOptions options, - Stream stream=None + object stream=None ): """Filter row groups using column chunk bloom filters. @@ -359,7 +359,7 @@ cdef class HybridScanReader: Filtered row group indices """ cdef vector[device_span[const_uint8_t]] spans_vec - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) for span in bloom_filter_data: spans_vec.push_back(_get_device_span(span)) @@ -372,7 +372,7 @@ cdef class HybridScanReader: ), host_span[const_size_type](indices_vec.data(), indices_vec.size()), options.c_obj, - stream.view() + _stream.view().value() ) return list(filtered) @@ -380,7 +380,7 @@ cdef class HybridScanReader: self, list row_group_indices, ParquetReaderOptions options, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Build a boolean column indicating surviving rows from page stats. 
@@ -402,16 +402,16 @@ cdef class HybridScanReader: Boolean column indicating surviving rows """ cdef vector[size_type] indices_vec = row_group_indices - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) cdef unique_ptr[column] c_result = \ self.c_obj.get()[0].build_row_mask_with_page_index_stats( host_span[const_size_type](indices_vec.data(), indices_vec.size()), options.c_obj, - stream.view(), + _stream.view().value(), mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) def filter_column_chunks_byte_ranges( self, @@ -447,7 +447,7 @@ cdef class HybridScanReader: Column row_mask, cpp_use_data_page_mask mask_data_pages, ParquetReaderOptions options, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Materialize filter columns and update the row mask. @@ -477,7 +477,7 @@ cdef class HybridScanReader: cdef vector[size_type] indices_vec = row_group_indices cdef vector[device_span[const_uint8_t]] spans_vec - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) for span in column_chunk_data: spans_vec.push_back(_get_device_span(span)) @@ -492,10 +492,10 @@ cdef class HybridScanReader: mask_view, mask_data_pages, options.c_obj, - stream.view(), + _stream.view().value(), mr.get_mr() ) - return TableWithMetadata.from_libcudf(c_result, stream, mr) + return TableWithMetadata.from_libcudf(c_result, _stream, mr) def payload_column_chunks_byte_ranges( self, @@ -531,7 +531,7 @@ cdef class HybridScanReader: Column row_mask, cpp_use_data_page_mask mask_data_pages, ParquetReaderOptions options, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Materialize payload columns and apply the row mask. 
@@ -561,7 +561,7 @@ cdef class HybridScanReader: cdef vector[size_type] indices_vec = row_group_indices cdef vector[device_span[const_uint8_t]] spans_vec - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) for span in column_chunk_data: spans_vec.push_back(_get_device_span(span)) @@ -576,10 +576,10 @@ cdef class HybridScanReader: mask_view, mask_data_pages, options.c_obj, - stream.view(), + _stream.view().value(), mr.get_mr() ) - return TableWithMetadata.from_libcudf(c_result, stream, mr) + return TableWithMetadata.from_libcudf(c_result, _stream, mr) def all_column_chunks_byte_ranges( self, @@ -613,7 +613,7 @@ cdef class HybridScanReader: list row_group_indices, list column_chunk_data, ParquetReaderOptions options, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Materialize all columns. @@ -639,7 +639,7 @@ cdef class HybridScanReader: cdef vector[size_type] indices_vec = row_group_indices cdef vector[device_span[const_uint8_t]] spans_vec - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) for span in column_chunk_data: spans_vec.push_back(_get_device_span(span)) @@ -650,10 +650,10 @@ cdef class HybridScanReader: spans_vec.data(), spans_vec.size() ), options.c_obj, - stream.view(), + _stream.view().value(), mr.get_mr() ) - return TableWithMetadata.from_libcudf(c_result, stream, mr) + return TableWithMetadata.from_libcudf(c_result, _stream, mr) def setup_chunking_for_filter_columns( self, @@ -664,7 +664,7 @@ cdef class HybridScanReader: cpp_use_data_page_mask mask_data_pages, list column_chunk_data, ParquetReaderOptions options, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Setup chunking information for filter columns. 
@@ -696,7 +696,7 @@ cdef class HybridScanReader: for span in column_chunk_data: spans_vec.push_back(_get_device_span(span)) - self.stream = _get_stream(stream) + self._stream = _get_stream(stream) self.mr = _get_memory_resource(mr) cdef column_view mask_view = row_mask.view() @@ -710,7 +710,7 @@ cdef class HybridScanReader: spans_vec.data(), spans_vec.size() ), options.c_obj, - self.stream.view(), + self._stream.view().value(), self.mr.get_mr() ) @@ -735,7 +735,7 @@ cdef class HybridScanReader: mask_view ) return TableWithMetadata.from_libcudf( - c_result, self.stream, self.mr + c_result, self._stream, self.mr ) def setup_chunking_for_payload_columns( @@ -747,7 +747,7 @@ cdef class HybridScanReader: cpp_use_data_page_mask mask_data_pages, list column_chunk_data, ParquetReaderOptions options, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Setup chunking information for payload columns. @@ -779,7 +779,7 @@ cdef class HybridScanReader: for span in column_chunk_data: spans_vec.push_back(_get_device_span(span)) - self.stream = _get_stream(stream) + self._stream = _get_stream(stream) self.mr = _get_memory_resource(mr) cdef column_view mask_view = row_mask.view() @@ -793,7 +793,7 @@ cdef class HybridScanReader: spans_vec.data(), spans_vec.size() ), options.c_obj, - self.stream.view(), + self._stream.view().value(), self.mr.get_mr() ) @@ -818,7 +818,7 @@ cdef class HybridScanReader: mask_view ) return TableWithMetadata.from_libcudf( - c_result, self.stream, self.mr + c_result, self._stream, self.mr ) def construct_row_group_passes( diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd index 96bc102ef0b..e46942ea14b 100644 --- a/python/pylibcudf/pylibcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/io/json.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from libcpp.map cimport map from libcpp.vector cimport vector -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.io.types cimport ( @@ -83,7 +82,7 @@ cdef class JsonReaderOptionsBuilder: cpdef build(self) cpdef TableWithMetadata read_json( - JsonReaderOptions options, Stream stream = *, DeviceMemoryResource mr = * + JsonReaderOptions options, object stream = *, DeviceMemoryResource mr = * ) cpdef TableWithMetadata read_json_from_string_column( @@ -93,7 +92,7 @@ cpdef TableWithMetadata read_json_from_string_column( list dtypes = *, compression_type compression = *, json_recovery_mode_t recovery_mode = *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *) cdef class JsonWriterOptions: @@ -117,13 +116,13 @@ cdef class JsonWriterOptionsBuilder: cpdef JsonWriterOptionsBuilder utf8_escaped(self, bool val) cpdef JsonWriterOptions build(self) -cpdef void write_json(JsonWriterOptions options, Stream stream = *) +cpdef void write_json(JsonWriterOptions options, object stream = *) cpdef bool is_supported_write_json(DataType type) cpdef tuple chunked_read_json( JsonReaderOptions options, int chunk_size= *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/io/json.pyi b/python/pylibcudf/pylibcudf/io/json.pyi index f19da874a0d..a03d8ef407c 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyi +++ b/python/pylibcudf/pylibcudf/io/json.pyi @@ -4,7 +4,6 @@ from collections.abc import Mapping from typing import Self, TypeAlias from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.io.types import ( @@ -17,6 +16,7 @@ from pylibcudf.io.types import ( from pylibcudf.scalar import Scalar from pylibcudf.table import Table from pylibcudf.types import DataType +from pylibcudf.utils 
import CudaStreamLike ChildNameToTypeMap: TypeAlias = Mapping[str, ChildNameToTypeMap] @@ -73,7 +73,7 @@ class JsonReaderOptionsBuilder: def read_json( options: JsonReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... def read_json_from_string_column( @@ -83,7 +83,7 @@ def read_json_from_string_column( dtypes: list | None = None, compression: CompressionType = CompressionType.NONE, recovery_mode: JSONRecoveryMode = JSONRecoveryMode.RECOVER_WITH_NULL, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... @@ -105,12 +105,12 @@ class JsonWriterOptionsBuilder: def build(self) -> JsonWriterOptions: ... def write_json( - options: JsonWriterOptions, stream: Stream | None = None + options: JsonWriterOptions, stream: CudaStreamLike | None = None ) -> None: ... def chunked_read_json( options: JsonReaderOptions, chunk_size: int = 100_000_000, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[list[Column], list[str], ChildNameToTypeMap]: ... def is_supported_write_json(type: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx index aa66c6fe5c2..1bce364fdd8 100644 --- a/python/pylibcudf/pylibcudf/io/json.pyx +++ b/python/pylibcudf/pylibcudf/io/json.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from libcpp.map cimport map @@ -49,6 +49,7 @@ from pylibcudf.utils cimport _get_stream from cython.operator import dereference from rmm.pylibrmm.device_buffer cimport DeviceBuffer +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "chunked_read_json", @@ -704,7 +705,7 @@ cdef class JsonReaderOptionsBuilder: cpdef tuple chunked_read_json( JsonReaderOptions options, int chunk_size=100_000_000, - Stream stream = None, + object stream = None, DeviceMemoryResource mr = None, ): """ @@ -735,6 +736,7 @@ cpdef tuple chunked_read_json( child_names = None i = 0 cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() mr = _get_memory_resource(mr) while True: options.enable_lines(True) @@ -743,7 +745,7 @@ cpdef tuple chunked_read_json( try: with nogil: - c_result = move(cpp_read_json(options.c_obj, s.view(), mr.get_mr())) + c_result = move(cpp_read_json(options.c_obj, _cs, mr.get_mr())) except (ValueError, OverflowError): break if meta_names is None: @@ -772,7 +774,7 @@ cpdef tuple chunked_read_json( cpdef TableWithMetadata read_json( JsonReaderOptions options, - Stream stream = None, + object stream = None, DeviceMemoryResource mr = None ): """ @@ -797,9 +799,10 @@ cpdef TableWithMetadata read_json( """ cdef table_with_metadata c_result cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = move(cpp_read_json(options.c_obj, s.view(), mr.get_mr())) + c_result = move(cpp_read_json(options.c_obj, _cs, mr.get_mr())) return TableWithMetadata.from_libcudf(c_result, s, mr) @@ -810,7 +813,7 @@ cpdef TableWithMetadata read_json_from_string_column( list dtypes = None, compression_type compression = compression_type.NONE, json_recovery_mode_t recovery_mode = json_recovery_mode_t.RECOVER_WITH_NULL, - Stream stream = None, + object stream = None, DeviceMemoryResource mr = None ): """ @@ -852,7 +855,8 @@ cpdef 
TableWithMetadata read_json_from_string_column( cdef unique_ptr[column] c_join_string_column cdef column_contents c_contents cdef table_with_metadata c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) # Join the string column into a single string @@ -862,7 +866,7 @@ cpdef TableWithMetadata read_json_from_string_column( input.view(), dereference(c_separator), dereference(c_narep), - stream.view(), + _cs, mr.get_mr() ) ) @@ -870,7 +874,7 @@ cpdef TableWithMetadata read_json_from_string_column( # Create a new source from the joined string data cdef SourceInfo joined_source = SourceInfo( - [DeviceBuffer.c_from_unique_ptr(move(c_contents.data), stream, mr)]) + [DeviceBuffer.c_from_unique_ptr(move(c_contents.data), _stream, mr)]) # Create new options using the joined string as source cdef JsonReaderOptions options = ( @@ -886,9 +890,9 @@ cpdef TableWithMetadata read_json_from_string_column( # Read JSON from the joined string with nogil: - c_result = move(cpp_read_json(options.c_obj, stream.view(), mr.get_mr())) + c_result = move(cpp_read_json(options.c_obj, _cs, mr.get_mr())) - return TableWithMetadata.from_libcudf(c_result, stream, mr) + return TableWithMetadata.from_libcudf(c_result, _stream, mr) cdef class JsonWriterOptions: """ @@ -1090,7 +1094,7 @@ cdef class JsonWriterOptionsBuilder: return json_options -cpdef void write_json(JsonWriterOptions options, Stream stream = None): +cpdef void write_json(JsonWriterOptions options, object stream = None): """ Writes a set of columns to JSON format. 
@@ -1106,8 +1110,9 @@ cpdef void write_json(JsonWriterOptions options, Stream stream = None): None """ cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() with nogil: - cpp_write_json(options.c_obj, s.view()) + cpp_write_json(options.c_obj, _cs) cpdef bool is_supported_write_json(DataType type): """Check if the dtype is supported for JSON writing diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd index 24221163917..72ad5aac534 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint64_t, int64_t @@ -9,7 +9,6 @@ from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.io.types cimport ( @@ -65,7 +64,7 @@ cdef class OrcReaderOptionsBuilder: cpdef OrcReaderOptions build(self) cpdef TableWithMetadata read_orc( - OrcReaderOptions options, Stream stream = *, DeviceMemoryResource mr=* + OrcReaderOptions options, object stream = *, DeviceMemoryResource mr=* ) cdef class OrcColumnStatistics: @@ -89,7 +88,7 @@ cdef class ParsedOrcStatistics: cpdef ParsedOrcStatistics read_parsed_orc_statistics( SourceInfo source_info, - Stream stream=* + object stream = * ) cdef class OrcWriterOptions: @@ -110,7 +109,7 @@ cdef class OrcWriterOptionsBuilder: cpdef OrcWriterOptionsBuilder metadata(self, TableInputMetadata meta) cpdef OrcWriterOptions build(self) -cpdef void write_orc(OrcWriterOptions options, Stream stream = *) +cpdef void write_orc(OrcWriterOptions options, object stream = *) cdef class OrcChunkedWriter: cdef unique_ptr[orc_chunked_writer] c_obj diff --git 
a/python/pylibcudf/pylibcudf/io/orc.pyi b/python/pylibcudf/pylibcudf/io/orc.pyi index dcf2b731bac..3cb6daff240 100644 --- a/python/pylibcudf/pylibcudf/io/orc.pyi +++ b/python/pylibcudf/pylibcudf/io/orc.pyi @@ -4,7 +4,6 @@ from typing import Any, Self from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.io.types import ( CompressionType, @@ -16,6 +15,7 @@ from pylibcudf.io.types import ( ) from pylibcudf.table import Table from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike class OrcReaderOptions: def set_num_rows(self, nrows: int) -> None: ... @@ -34,7 +34,7 @@ class OrcReaderOptionsBuilder: def read_orc( options: OrcReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... @@ -59,7 +59,7 @@ class ParsedOrcStatistics: def read_parsed_orc_statistics( source_info: SourceInfo, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> ParsedOrcStatistics: ... class OrcWriterOptions: @@ -79,7 +79,7 @@ class OrcWriterOptionsBuilder: def build(self) -> OrcWriterOptions: ... def write_orc( - options: OrcWriterOptions, stream: Stream | None = None + options: OrcWriterOptions, stream: CudaStreamLike | None = None ) -> None: ... def is_supported_read_orc(compression: CompressionType) -> bool: ... def is_supported_write_orc(compression: CompressionType) -> bool: ... @@ -90,7 +90,7 @@ class OrcChunkedWriter: def write(self, table: Table) -> None: ... @staticmethod def from_options( - options: ChunkedOrcWriterOptions, stream: Stream | None = None + options: ChunkedOrcWriterOptions, stream: CudaStreamLike | None = None ) -> OrcChunkedWriter: ... 
 class ChunkedOrcWriterOptions:
diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx
index 8c3687ec232..3a2fabc5683 100644
--- a/python/pylibcudf/pylibcudf/io/orc.pyx
+++ b/python/pylibcudf/pylibcudf/io/orc.pyx
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
 # SPDX-License-Identifier: Apache-2.0
 from libcpp cimport bool
 from libcpp.string cimport string
@@ -8,6 +8,7 @@ from libcpp.vector cimport vector
 import datetime
 
 from rmm.pylibrmm.stream cimport Stream
+from cuda.bindings.cyruntime cimport cudaStream_t
 from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
 
 from pylibcudf.io.types cimport SourceInfo, TableWithMetadata, SinkInfo
@@ -444,7 +445,7 @@ cdef class OrcReaderOptionsBuilder:
 
 
 cpdef TableWithMetadata read_orc(
-    OrcReaderOptions options, Stream stream = None, DeviceMemoryResource mr=None
+    OrcReaderOptions options, object stream = None, DeviceMemoryResource mr=None
 ):
     """
     Read from ORC format.
@@ -465,17 +466,18 @@ cpdef TableWithMetadata read_orc(
     """
     cdef table_with_metadata c_result
     cdef Stream s = _get_stream(stream)
+    cdef cudaStream_t _cs = s.view().value()
     mr = _get_memory_resource(mr)
     with nogil:
-        c_result = move(cpp_read_orc(options.c_obj, s.view(), mr.get_mr()))
+        c_result = move(cpp_read_orc(options.c_obj, _cs, mr.get_mr()))
 
     return TableWithMetadata.from_libcudf(c_result, s, mr)
 
 
 cpdef ParsedOrcStatistics read_parsed_orc_statistics(
     SourceInfo source_info,
-    Stream stream=None
+    object stream=None
 ):
     """
     Read ORC statistics from a source.
@@ -494,8 +496,9 @@ cpdef ParsedOrcStatistics read_parsed_orc_statistics(
     """
     cdef Stream s = _get_stream(stream)
     cdef parsed_orc_statistics parsed
+    cdef cudaStream_t _cs = s.view().value()
     with nogil:
-        parsed = cpp_read_parsed_orc_statistics(source_info.c_obj, s.view())
+        parsed = cpp_read_parsed_orc_statistics(source_info.c_obj, _cs)
 
     return ParsedOrcStatistics.from_libcudf(parsed)
 
@@ -667,7 +670,7 @@ cdef class OrcWriterOptionsBuilder:
         return orc_options
 
 
-cpdef void write_orc(OrcWriterOptions options, Stream stream = None):
+cpdef void write_orc(OrcWriterOptions options, object stream = None):
     """
     Write to ORC format.
 
@@ -688,8 +691,9 @@ cpdef void write_orc(OrcWriterOptions options, Stream stream = None):
     None
     """
     cdef Stream s = _get_stream(stream)
+    cdef cudaStream_t _cs = s.view().value()
     with nogil:
-        cpp_write_orc(move(options.c_obj), s.view())
+        cpp_write_orc(move(options.c_obj), _cs)
 
 
 cdef class OrcChunkedWriter:
@@ -721,7 +725,7 @@ cdef class OrcChunkedWriter:
         self.c_obj.get()[0].write(table.view())
 
     @staticmethod
-    def from_options(ChunkedOrcWriterOptions options, Stream stream = None):
+    def from_options(ChunkedOrcWriterOptions options, object stream = None):
         """
         Creates a chunked ORC writer from options
 
@@ -740,7 +744,8 @@ cdef class OrcChunkedWriter:
             OrcChunkedWriter
         )
         cdef Stream s = _get_stream(stream)
-        orc_writer.c_obj.reset(new orc_chunked_writer(options.c_obj, s.view()))
+        cdef cudaStream_t _cs = s.view().value()
+        orc_writer.c_obj.reset(new orc_chunked_writer(options.c_obj, _cs))
         return orc_writer
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd
index d9350f77721..c98a90dd692 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -6,8 +6,8 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-from rmm.pylibrmm.stream cimport Stream
 from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource
+from rmm.pylibrmm.stream cimport Stream from pylibcudf.expressions cimport Expression @@ -74,7 +74,7 @@ cdef class ParquetReaderOptionsBuilder: cdef class ChunkedParquetReader: - cdef readonly Stream stream + cdef Stream _stream cdef DeviceMemoryResource mr cdef unique_ptr[cpp_chunked_parquet_reader] reader @@ -83,7 +83,7 @@ cdef class ChunkedParquetReader: cpdef read_parquet( - ParquetReaderOptions options, Stream stream = *, DeviceMemoryResource mr=* + ParquetReaderOptions options, object stream = *, DeviceMemoryResource mr=* ) @@ -180,7 +180,7 @@ cdef class ParquetWriterOptionsBuilder: cpdef ParquetWriterOptions build(self) -cpdef memoryview write_parquet(ParquetWriterOptions options, Stream stream = *) +cpdef memoryview write_parquet(ParquetWriterOptions options, object stream = *) cpdef bool is_supported_read_parquet(compression_type compression) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyi b/python/pylibcudf/pylibcudf/io/parquet.pyi index c0c31e22007..f0a092f63e0 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyi +++ b/python/pylibcudf/pylibcudf/io/parquet.pyi @@ -5,7 +5,6 @@ from collections.abc import Mapping, Sequence from typing import Self from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.expressions import Expression from pylibcudf.io.types import ( @@ -20,6 +19,7 @@ from pylibcudf.io.types import ( ) from pylibcudf.table import Table from pylibcudf.types import TypeId +from pylibcudf.utils import CudaStreamLike class ParquetReaderOptions: def __init__(self): ... @@ -53,7 +53,7 @@ class ChunkedParquetReader: def __init__( self, options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, chunk_read_limit: int = 0, pass_read_limit: int = 1024000000, ) -> None: ... 
@@ -62,7 +62,7 @@ class ChunkedParquetReader: def read_parquet( options: ParquetReaderOptions, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> TableWithMetadata: ... @@ -101,7 +101,7 @@ class ParquetWriterOptionsBuilder: def build(self) -> ParquetWriterOptions: ... def write_parquet( - options: ParquetWriterOptions, stream: Stream | None = None + options: ParquetWriterOptions, stream: CudaStreamLike | None = None ) -> memoryview: ... def is_supported_read_parquet(compression: CompressionType) -> bool: ... def is_supported_write_parquet(compression: CompressionType) -> bool: ... @@ -112,7 +112,8 @@ class ChunkedParquetWriter: def write(self, table: Table, partitions_info: object = None) -> None: ... @staticmethod def from_options( - options: ChunkedParquetWriterOptions, stream: Stream | None = None + options: ChunkedParquetWriterOptions, + stream: CudaStreamLike | None = None, ) -> ChunkedParquetWriter: ... class ChunkedParquetWriterOptions: diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index c4bad082304..86904513cfa 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -46,6 +46,7 @@ from pylibcudf.libcudf.io.types cimport ( from pylibcudf.libcudf.types cimport size_type, type_id from pylibcudf.table cimport Table from pylibcudf.utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "ChunkedParquetReader", @@ -507,20 +508,21 @@ cdef class ChunkedParquetReader: def __init__( self, ParquetReaderOptions options, - Stream stream = None, + object stream = None, DeviceMemoryResource mr = None, size_t chunk_read_limit=0, size_t pass_read_limit=1024000000, ): - self.stream = _get_stream(stream) + self._stream = _get_stream(stream) self.mr = _get_memory_resource(mr) + cdef cudaStream_t stream_view = self._stream.view().value() with nogil: 
self.reader.reset( new cpp_chunked_parquet_reader( chunk_read_limit, pass_read_limit, options.c_obj, - self.stream.view(), + stream_view, self.mr.get_mr() ) ) @@ -560,11 +562,11 @@ cdef class ChunkedParquetReader: with nogil: c_result = move(self.reader.get()[0].read_chunk()) - return TableWithMetadata.from_libcudf(c_result, self.stream, mr) + return TableWithMetadata.from_libcudf(c_result, self._stream, mr) cpdef read_parquet( - ParquetReaderOptions options, Stream stream = None, DeviceMemoryResource mr=None + ParquetReaderOptions options, object stream = None, DeviceMemoryResource mr=None ): """ Read from Parquet format. @@ -584,9 +586,10 @@ cpdef read_parquet( Device memory resource used to allocate the returned table's device memory. """ cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = move(cpp_read_parquet(options.c_obj, s.view(), mr.get_mr())) + c_result = move(cpp_read_parquet(options.c_obj, _cs, mr.get_mr())) return TableWithMetadata.from_libcudf(c_result, s, mr) @@ -640,7 +643,7 @@ cdef class ChunkedParquetWriter: self.c_obj.get()[0].write(table.view(), partitions) @staticmethod - def from_options(ChunkedParquetWriterOptions options, Stream stream = None): + def from_options(ChunkedParquetWriterOptions options, object stream = None): """ Creates a chunked Parquet writer from options @@ -659,8 +662,9 @@ cdef class ChunkedParquetWriter: ChunkedParquetWriter ) cdef Stream s = _get_stream(stream) + cdef cudaStream_t _cs = s.view().value() parquet_writer.c_obj.reset( - new cpp_chunked_parquet_writer(options.c_obj, s.view()) + new cpp_chunked_parquet_writer(options.c_obj, _cs) ) return parquet_writer @@ -1235,7 +1239,7 @@ cdef class ParquetWriterOptionsBuilder: return parquet_options -cpdef memoryview write_parquet(ParquetWriterOptions options, Stream stream = None): +cpdef memoryview write_parquet(ParquetWriterOptions options, object stream = None): """ Writes a set of columns 
to parquet format. @@ -1255,9 +1259,9 @@ cpdef memoryview write_parquet(ParquetWriterOptions options, Stream stream = Non """ cdef unique_ptr[vector[uint8_t]] c_result cdef Stream s = _get_stream(stream) - + cdef cudaStream_t _cs = s.view().value() with nogil: - c_result = cpp_write_parquet(move(options.c_obj), s.view()) + c_result = cpp_write_parquet(move(options.c_obj), _cs) return memoryview(HostBuffer.from_unique_ptr(move(c_result))) diff --git a/python/pylibcudf/pylibcudf/io/text.pxd b/python/pylibcudf/pylibcudf/io/text.pxd index 7623c8da26b..5276f9ffaba 100644 --- a/python/pylibcudf/pylibcudf/io/text.pxd +++ b/python/pylibcudf/pylibcudf/io/text.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.string cimport string from pylibcudf.column cimport Column -from pylibcudf.io.types cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.libcudf.io.text cimport parse_options, data_chunk_source, byte_range_info @@ -23,7 +22,7 @@ cpdef Column multibyte_split( DataChunkSource source, str delimiter, ParseOptions options=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/io/text.pyi b/python/pylibcudf/pylibcudf/io/text.pyi index 66406c94dd2..581e45c3194 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyi +++ b/python/pylibcudf/pylibcudf/io/text.pyi @@ -1,10 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike class ByteRangeInfo: def __init__(self, offset: int, size: int) -> None: ... @@ -35,6 +35,6 @@ def multibyte_split( source: DataChunkSource, delimiter: str, options: ParseOptions | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/io/text.pyx b/python/pylibcudf/pylibcudf/io/text.pyx index 9fb220b0a37..be15701a4d8 100644 --- a/python/pylibcudf/pylibcudf/io/text.pyx +++ b/python/pylibcudf/pylibcudf/io/text.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -9,10 +9,11 @@ from libcpp.utility cimport move from pylibcudf.column cimport Column from pylibcudf.utils cimport _get_stream, _get_memory_resource -from pylibcudf.io.types cimport Stream +from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.io cimport text as cpp_text +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "ByteRangeInfo", @@ -193,7 +194,7 @@ cpdef Column multibyte_split( DataChunkSource source, str delimiter, ParseOptions options=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -224,7 +225,8 @@ cpdef Column multibyte_split( cdef unique_ptr[column] c_result cdef unique_ptr[data_chunk_source] c_source = move(source.c_source) cdef string c_delimiter = delimiter.encode() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() 
mr = _get_memory_resource(mr) if options is None: @@ -237,8 +239,8 @@ cpdef Column multibyte_split( dereference(c_source), c_delimiter, c_options, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/io/timezone.pxd index a2fa33d102d..9a12be928b2 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pxd +++ b/python/pylibcudf/pylibcudf/io/timezone.pxd @@ -1,11 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from ..table cimport Table -from .types cimport Stream + from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cpdef Table make_timezone_transition_table( - str tzif_dir, str timezone_name, Stream stream=*, DeviceMemoryResource mr=* + str tzif_dir, str timezone_name, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyi b/python/pylibcudf/pylibcudf/io/timezone.pyi index d83f68424b4..f87dda70f70 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyi +++ b/python/pylibcudf/pylibcudf/io/timezone.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def make_timezone_transition_table( tzif_dir: str, timezone_name: str, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx index 0416df1cf0b..033ed15a1ba 100644 --- a/python/pylibcudf/pylibcudf/io/timezone.pyx +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -12,13 +12,14 @@ from pylibcudf.libcudf.table.table cimport table from ..utils cimport _get_stream, _get_memory_resource from ..table cimport Table -from .types cimport Stream +from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["make_timezone_transition_table"] cpdef Table make_timezone_transition_table( - str tzif_dir, str timezone_name, Stream stream=None, DeviceMemoryResource mr=None, + str tzif_dir, str timezone_name, object stream=None, DeviceMemoryResource mr=None, ): """ Creates a transition table to convert ORC timestamps to UTC. 
@@ -42,15 +43,16 @@ cpdef Table make_timezone_transition_table( cdef unique_ptr[table] c_result cdef string c_tzdir = tzif_dir.encode() cdef string c_tzname = timezone_name.encode() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_make_timezone_transition_table( make_optional[string](c_tzdir), c_tzname, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd index db7e2ad95c5..1e52f4faa05 100644 --- a/python/pylibcudf/pylibcudf/io/types.pxd +++ b/python/pylibcudf/pylibcudf/io/types.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint8_t, int32_t @@ -29,7 +29,6 @@ from pylibcudf.libcudf.utilities.span cimport host_span from pylibcudf.table cimport Table -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class PartitionInfo: @@ -86,7 +85,7 @@ cdef class TableWithMetadata: @staticmethod cdef TableWithMetadata from_libcudf( - table_with_metadata& tbl, Stream stream, DeviceMemoryResource mr + table_with_metadata& tbl, object stream, DeviceMemoryResource mr ) cdef class SourceInfo: diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 1c4a7f49268..27c3bb47caf 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -33,7 +33,6 @@ from pylibcudf.libcudf.utilities.span cimport device_span, host_span from pylibcudf.span import is_span from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport 
Stream import codecs import errno @@ -396,7 +395,7 @@ cdef class TableWithMetadata: @staticmethod cdef TableWithMetadata from_libcudf( table_with_metadata& tbl_with_meta, - Stream stream, + object stream, DeviceMemoryResource mr ): """Create a Python TableWithMetadata from a libcudf table_with_metadata""" diff --git a/python/pylibcudf/pylibcudf/join.pxd b/python/pylibcudf/pylibcudf/join.pxd index 31a998029e3..f0b69a42621 100644 --- a/python/pylibcudf/pylibcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/join.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from pylibcudf.libcudf cimport join as cpp_join from pylibcudf.libcudf.types cimport null_equality -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column @@ -16,7 +15,7 @@ cpdef tuple inner_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -24,7 +23,7 @@ cpdef tuple left_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -32,7 +31,7 @@ cpdef tuple full_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -40,7 +39,7 @@ cpdef Column left_semi_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -48,19 +47,19 @@ cpdef Column left_anti_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Table cross_join( - Table left, Table right, Stream stream=*, DeviceMemoryResource mr=* + Table left, Table right, object stream = *, 
DeviceMemoryResource mr=* ) cpdef tuple conditional_inner_join( Table left, Table right, Expression binary_predicate, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -68,7 +67,7 @@ cpdef tuple conditional_left_join( Table left, Table right, Expression binary_predicate, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -76,7 +75,7 @@ cpdef tuple conditional_full_join( Table left, Table right, Expression binary_predicate, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -84,7 +83,7 @@ cpdef Column conditional_left_semi_join( Table left, Table right, Expression binary_predicate, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -92,7 +91,7 @@ cpdef Column conditional_left_anti_join( Table left, Table right, Expression binary_predicate, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -103,7 +102,7 @@ cpdef tuple mixed_inner_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -114,7 +113,7 @@ cpdef tuple mixed_left_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -125,7 +124,7 @@ cpdef tuple mixed_full_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -136,7 +135,7 @@ cpdef Column mixed_left_semi_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -147,7 +146,7 @@ cpdef Column mixed_left_anti_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/join.pyi b/python/pylibcudf/pylibcudf/join.pyi 
index 615eb914618..1cf86c7c704 100644 --- a/python/pylibcudf/pylibcudf/join.pyi +++ b/python/pylibcudf/pylibcudf/join.pyi @@ -4,12 +4,12 @@ from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.expressions import Expression from pylibcudf.table import Table from pylibcudf.types import NullEquality +from pylibcudf.utils import CudaStreamLike class SetAsBuildTable(IntEnum): LEFT = ... @@ -19,76 +19,76 @@ def inner_join( left_keys: Table, right_keys: Table, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def left_join( left_keys: Table, right_keys: Table, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def full_join( left_keys: Table, right_keys: Table, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def left_semi_join( left_keys: Table, right_keys: Table, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def left_anti_join( left_keys: Table, right_keys: Table, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def cross_join( left: Table, right: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
def conditional_inner_join( left: Table, right: Table, binary_predicate: Expression, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def conditional_left_join( left: Table, right: Table, binary_predicate: Expression, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def conditional_full_join( left: Table, right: Table, binary_predicate: Expression, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def conditional_left_semi_join( left: Table, right: Table, binary_predicate: Expression, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def conditional_left_anti_join( left: Table, right: Table, binary_predicate: Expression, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def mixed_inner_join( @@ -98,7 +98,7 @@ def mixed_inner_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def mixed_left_join( @@ -108,7 +108,7 @@ def mixed_left_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... def mixed_full_join( @@ -118,7 +118,7 @@ def mixed_full_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... 
def mixed_left_semi_join( @@ -128,7 +128,7 @@ def mixed_left_semi_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def mixed_left_anti_join( @@ -138,7 +138,7 @@ def mixed_left_anti_join( right_conditional: Table, binary_predicate: Expression, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... @@ -148,17 +148,17 @@ class FilteredJoin: build: Table, compare_nulls: NullEquality, load_factor: float = ..., - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> None: ... def semi_join( self, probe: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def anti_join( self, probe: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index 61a321b27a8..78a44554dff 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -22,6 +22,7 @@ from .table cimport Table from .utils cimport _get_stream, _get_memory_resource from pylibcudf.libcudf.join import set_as_build_table as SetAsBuildTable # no-cython-lint # noqa: F401, deprecated +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "conditional_full_join", @@ -45,9 +46,10 @@ __all__ = [ ] cdef Column _column_from_gather_map( - cpp_join.gather_map_type gather_map, Stream stream, DeviceMemoryResource mr + cpp_join.gather_map_type gather_map, object stream, DeviceMemoryResource mr ): # helper to convert a gather map to a Column + cdef Stream _stream = _get_stream(stream) return Column.from_libcudf( move( make_unique[column]( @@ -55,9 +57,7 @@ cdef Column _column_from_gather_map( device_buffer(), 0 ) - ), - stream, - mr + ), _stream, mr ) @@ -65,7 +65,7 @@ cpdef tuple inner_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform an inner join between two tables. 
@@ -89,16 +89,21 @@ cpdef tuple inner_join( """ cdef cpp_join.gather_map_pair_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_join.inner_join( - left_keys.view(), right_keys.view(), nulls_equal, stream.view(), mr.get_mr() + left_keys.view(), + right_keys.view(), + nulls_equal, + _cs, + mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -106,7 +111,7 @@ cpdef tuple left_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a left join between two tables. @@ -130,16 +135,21 @@ cpdef tuple left_join( """ cdef cpp_join.gather_map_pair_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_join.left_join( - left_keys.view(), right_keys.view(), nulls_equal, stream.view(), mr.get_mr() + left_keys.view(), + right_keys.view(), + nulls_equal, + _cs, + mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -147,7 +157,7 @@ cpdef tuple full_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a full join between two tables. 
@@ -171,16 +181,21 @@ cpdef tuple full_join( """ cdef cpp_join.gather_map_pair_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_join.full_join( - left_keys.view(), right_keys.view(), nulls_equal, stream.view(), mr.get_mr() + left_keys.view(), + right_keys.view(), + nulls_equal, + _cs, + mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -188,7 +203,7 @@ cpdef Column left_semi_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a left semi join between two tables. @@ -211,7 +226,8 @@ cpdef Column left_semi_join( """ cdef cpp_join.gather_map_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) cdef unique_ptr[cpp_join.filtered_join] join_obj @@ -221,22 +237,22 @@ cpdef Column left_semi_join( new cpp_join.filtered_join( right_keys.view(), nulls_equal, - stream.view() + _cs ) ) c_result = join_obj.get()[0].semi_join( left_keys.view(), - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) cpdef Column left_anti_join( Table left_keys, Table right_keys, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a left anti join between two tables. 
@@ -259,7 +275,8 @@ cpdef Column left_anti_join( """ cdef cpp_join.gather_map_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) cdef unique_ptr[cpp_join.filtered_join] join_obj @@ -269,19 +286,19 @@ cpdef Column left_anti_join( new cpp_join.filtered_join( right_keys.view(), nulls_equal, - stream.view() + _cs ) ) c_result = join_obj.get()[0].anti_join( left_keys.view(), - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) cpdef Table cross_join( - Table left, Table right, Stream stream=None, DeviceMemoryResource mr=None + Table left, Table right, object stream=None, DeviceMemoryResource mr=None ): """Perform a cross join on two tables. @@ -305,21 +322,22 @@ cpdef Table cross_join( """ cdef unique_ptr[table] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: result = cpp_join.cross_join( - left.view(), right.view(), stream.view(), mr.get_mr() + left.view(), right.view(), _cs, mr.get_mr() ) - return Table.from_libcudf(move(result), stream, mr) + return Table.from_libcudf(move(result), _stream, mr) cpdef tuple conditional_inner_join( Table left, Table right, Expression binary_predicate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a conditional inner join between two tables. 
@@ -344,7 +362,8 @@ cpdef tuple conditional_inner_join( cdef cpp_join.gather_map_pair_type c_result cdef optional[size_t] output_size - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -353,12 +372,12 @@ cpdef tuple conditional_inner_join( right.view(), dereference(binary_predicate.c_obj.get()), output_size, - stream.view(), + _cs, mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -366,7 +385,7 @@ cpdef tuple conditional_left_join( Table left, Table right, Expression binary_predicate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a conditional left join between two tables. @@ -391,7 +410,8 @@ cpdef tuple conditional_left_join( cdef cpp_join.gather_map_pair_type c_result cdef optional[size_t] output_size - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -400,12 +420,12 @@ cpdef tuple conditional_left_join( right.view(), dereference(binary_predicate.c_obj.get()), output_size, - stream.view(), + _cs, mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -413,7 +433,7 @@ cpdef tuple conditional_full_join( Table left, Table right, Expression binary_predicate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a conditional full join between two tables. 
@@ -437,7 +457,8 @@ cpdef tuple conditional_full_join( """ cdef cpp_join.gather_map_pair_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -445,12 +466,12 @@ cpdef tuple conditional_full_join( left.view(), right.view(), dereference(binary_predicate.c_obj.get()), - stream.view(), + _cs, mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -458,7 +479,7 @@ cpdef Column conditional_left_semi_join( Table left, Table right, Expression binary_predicate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a conditional left semi join between two tables. @@ -482,7 +503,8 @@ cpdef Column conditional_left_semi_join( cdef cpp_join.gather_map_type c_result cdef optional[size_t] output_size - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -491,17 +513,17 @@ cpdef Column conditional_left_semi_join( right.view(), dereference(binary_predicate.c_obj.get()), output_size, - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) cpdef Column conditional_left_anti_join( Table left, Table right, Expression binary_predicate, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a conditional left anti join between two tables. 
@@ -525,7 +547,8 @@ cpdef Column conditional_left_anti_join( cdef cpp_join.gather_map_type c_result cdef optional[size_t] output_size - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -534,10 +557,10 @@ cpdef Column conditional_left_anti_join( right.view(), dereference(binary_predicate.c_obj.get()), output_size, - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) cpdef tuple mixed_inner_join( @@ -547,7 +570,7 @@ cpdef tuple mixed_inner_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a mixed inner join between two tables. @@ -578,7 +601,8 @@ cpdef tuple mixed_inner_join( cdef cpp_join.gather_map_pair_type c_result cdef cpp_join.output_size_data_type empty_optional - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -590,12 +614,12 @@ cpdef tuple mixed_inner_join( dereference(binary_predicate.c_obj.get()), nulls_equal, empty_optional, - stream.view(), + _cs, mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -606,7 +630,7 @@ cpdef tuple mixed_left_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a mixed left join between two tables. 
@@ -637,7 +661,8 @@ cpdef tuple mixed_left_join( cdef cpp_join.gather_map_pair_type c_result cdef cpp_join.output_size_data_type empty_optional - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -649,12 +674,12 @@ cpdef tuple mixed_left_join( dereference(binary_predicate.c_obj.get()), nulls_equal, empty_optional, - stream.view(), + _cs, mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -665,7 +690,7 @@ cpdef tuple mixed_full_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a mixed full join between two tables. @@ -696,7 +721,8 @@ cpdef tuple mixed_full_join( cdef cpp_join.gather_map_pair_type c_result cdef cpp_join.output_size_data_type empty_optional - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -708,12 +734,12 @@ cpdef tuple mixed_full_join( dereference(binary_predicate.c_obj.get()), nulls_equal, empty_optional, - stream.view(), + _cs, mr.get_mr() ) return ( - _column_from_gather_map(move(c_result.first), stream, mr), - _column_from_gather_map(move(c_result.second), stream, mr), + _column_from_gather_map(move(c_result.first), _stream, mr), + _column_from_gather_map(move(c_result.second), _stream, mr), ) @@ -724,7 +750,7 @@ cpdef Column mixed_left_semi_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a mixed left semi join between two tables. 
@@ -753,7 +779,8 @@ cpdef Column mixed_left_semi_join( """ cdef cpp_join.gather_map_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -764,10 +791,10 @@ cpdef Column mixed_left_semi_join( right_conditional.view(), dereference(binary_predicate.c_obj.get()), nulls_equal, - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) cpdef Column mixed_left_anti_join( @@ -777,7 +804,7 @@ cpdef Column mixed_left_anti_join( Table right_conditional, Expression binary_predicate, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a mixed left anti join between two tables. @@ -806,7 +833,8 @@ cpdef Column mixed_left_anti_join( """ cdef cpp_join.gather_map_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -817,10 +845,10 @@ cpdef Column mixed_left_anti_join( right_conditional.view(), dereference(binary_predicate.c_obj.get()), nulls_equal, - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) cdef class FilteredJoin: @@ -841,7 +869,7 @@ cdef class FilteredJoin: Table build, null_equality compare_nulls, double load_factor=0.5, - Stream stream=None, + object stream=None, ): """ Construct a filtered hash join object for subsequent probe calls. @@ -858,7 +886,8 @@ cdef class FilteredJoin: stream : Stream, optional CUDA stream used for device memory operations and kernel launches. 
""" - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() with nogil: self.c_obj.reset( @@ -866,14 +895,14 @@ cdef class FilteredJoin: build.view(), compare_nulls, load_factor, - stream.view() + _cs ) ) def semi_join( self, Table probe, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -898,21 +927,22 @@ cdef class FilteredJoin: """ cdef cpp_join.gather_map_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = self.c_obj.get()[0].semi_join( probe.view(), - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) def anti_join( self, Table probe, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -937,13 +967,14 @@ cdef class FilteredJoin: """ cdef cpp_join.gather_map_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = self.c_obj.get()[0].anti_join( probe.view(), - stream.view(), + _cs, mr.get_mr() ) - return _column_from_gather_map(move(c_result), stream, mr) + return _column_from_gather_map(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/json.pxd b/python/pylibcudf/pylibcudf/json.pxd index 5489fa26ee8..47cf3b37c63 100644 --- a/python/pylibcudf/pylibcudf/json.pxd +++ b/python/pylibcudf/pylibcudf/json.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column @@ -6,7 +6,6 @@ from pylibcudf.libcudf.json cimport get_json_object_options from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cdef class GetJsonObjectOptions: @@ -17,6 +16,6 @@ cpdef Column get_json_object( Column col, Scalar json_path, GetJsonObjectOptions options=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/json.pyi b/python/pylibcudf/pylibcudf/json.pyi index fa6bb08d510..a60bcb36f26 100644 --- a/python/pylibcudf/pylibcudf/json.pyi +++ b/python/pylibcudf/pylibcudf/json.pyi @@ -1,11 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike class GetJsonObjectOptions: def __init__( @@ -26,6 +26,6 @@ def get_json_object( col: Column, json_path: Scalar, options: GetJsonObjectOptions | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/json.pyx b/python/pylibcudf/pylibcudf/json.pyx index b50bd4e7714..a470f6a1cb3 100644 --- a/python/pylibcudf/pylibcudf/json.pyx +++ b/python/pylibcudf/pylibcudf/json.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -15,6 +15,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["GetJsonObjectOptions", "get_json_object"] @@ -120,7 +121,7 @@ cpdef Column get_json_object( Column col, Scalar json_path, GetJsonObjectOptions options=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -155,7 +156,8 @@ cpdef Column get_json_object( options = GetJsonObjectOptions() cdef cpp_json.get_json_object_options c_options = options.options - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -163,8 +165,8 @@ cpdef Column get_json_object( col.view(), dereference(c_json_path), c_options, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd index fc93568ed7c..0d8f02d48ce 100644 --- a/python/pylibcudf/pylibcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/labeling.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from pylibcudf.libcudf.labeling cimport inclusive from .column cimport Column -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource @@ -15,6 +14,6 @@ cpdef Column label_bins( inclusive left_inclusive, Column right_edges, inclusive right_inclusive, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/labeling.pyi b/python/pylibcudf/pylibcudf/labeling.pyi index e9ff5c97f0b..272edd43f5f 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyi +++ b/python/pylibcudf/pylibcudf/labeling.pyi @@ -1,12 +1,12 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike class Inclusive(IntEnum): YES = ... @@ -18,6 +18,6 @@ def label_bins( left_inclusive: Inclusive, right_edges: Column, right_inclusive: Inclusive, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index 878390543cb..e3a052f7cb8 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -14,6 +14,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["Inclusive", "label_bins"] @@ -23,7 +24,7 @@ cpdef Column label_bins( inclusive left_inclusive, Column right_edges, inclusive right_inclusive, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Labels elements based on membership in the specified bins. @@ -54,7 +55,8 @@ cpdef Column label_bins( according to the specified bins. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -64,10 +66,10 @@ cpdef Column label_bins( left_inclusive, right_edges.view(), right_inclusive, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) Inclusive.__str__ = Inclusive.__repr__ diff --git a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd index 7ec2c6fe31f..303b112f71e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -57,7 +57,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const column_view& rhs, binary_operator op, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except 
+libcudf_exception_handler @@ -66,7 +66,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const scalar& rhs, binary_operator op, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -75,7 +75,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const column_view& rhs, binary_operator op, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -84,7 +84,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil: const column_view& rhs, const string& op, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd index daefd24fb7b..b22eeb1dd40 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport data_type, size_type from rmm.librmm.device_buffer cimport device_buffer -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -25,13 +25,13 @@ cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil: column() except +libcudf_exception_handler column( const column& other, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler column( column_view view, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd index 
5e17d3b89bd..f8cf3b38ccb 100644 --- a/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd @@ -13,7 +13,7 @@ from pylibcudf.libcudf.types cimport ( ) from rmm.librmm.device_buffer cimport device_buffer -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -22,7 +22,7 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: data_type type, size_type size, mask_state state, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -31,7 +31,7 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, device_buffer mask, size_type null_count, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -39,7 +39,7 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: data_type type, size_type size, mask_state state, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_point_column( @@ -47,14 +47,14 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, device_buffer mask, size_type null_count, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_timestamp_column( data_type type, size_type size, mask_state state, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_timestamp_column( @@ -62,14 +62,14 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, device_buffer mask, size_type null_count, - cuda_stream_view stream, + 
cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_duration_column( data_type type, size_type size, mask_state state, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_duration_column( @@ -77,14 +77,14 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, device_buffer mask, size_type null_count, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_width_column( data_type type, size_type size, mask_state state, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_fixed_width_column( @@ -92,27 +92,27 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil: size_type size, device_buffer mask, size_type null_count, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_column_from_scalar( const scalar& s, size_type size, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] make_dictionary_from_scalar( const scalar& s, size_type size, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] make_dictionary_column( unique_ptr[column] keys_column, unique_ptr[column] indices_column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] make_empty_column( diff --git a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd index 272f452a0a0..53cadee79c9 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view from pylibcudf.libcudf.utilities.span cimport host_span from rmm.librmm.device_buffer cimport device_buffer -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -24,11 +24,11 @@ cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil: cdef unique_ptr[column] concatenate( const vector[column_view] columns, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[table] concatenate( const vector[table_view] tables, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd index 9d839835465..dd439d0d01d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd @@ -10,7 +10,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from pylibcudf.libcudf.utilities.span cimport device_span from rmm.librmm.device_buffer cimport device_buffer -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -32,7 +32,7 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil: unique_ptr[chunked_pack] create( const table_view & input, size_t user_buffer_size, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref temp_mr, ) except +libcudf_exception_handler @@ -43,13 +43,13 @@ cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" 
nogil: cdef vector[contiguous_split_result] contiguous_split ( table_view input_table, vector[size_type] splits, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef packed_columns pack ( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd index 2c3741342e9..36c95fa777c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/copying.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd @@ -17,7 +17,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref ctypedef const scalar constscalar @@ -31,7 +31,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const table_view& source_table, const column_view& gather_map, out_of_bounds_policy policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -39,7 +39,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const column_view& input, size_type offset, const scalar& fill_values, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -47,7 +47,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const table_view& source_table, const column_view& scatter_map, const table_view& target_table, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -55,7 +55,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const vector[reference_wrapper[constscalar]]& 
source_scalars, const column_view& indices, const table_view& target, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -71,7 +71,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: cdef unique_ptr[column] allocate_like ( const column_view& input_column, mask_allocation_policy policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -79,7 +79,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const column_view& input_column, size_type size, mask_allocation_policy policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -93,7 +93,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef unique_ptr[column] copy_range ( @@ -102,39 +102,39 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: size_type input_begin, size_type input_end, size_type target_begin, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef vector[column_view] slice ( const column_view& input_column, vector[size_type] indices, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef vector[table_view] slice ( const table_view& input_table, vector[size_type] indices, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef vector[column_view] split ( const column_view& input_column, vector[size_type] splits, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef vector[table_view] split ( const table_view& input_table, vector[size_type] splits, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef 
unique_ptr[column] copy_if_else ( const column_view& lhs, const column_view& rhs, const column_view& boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -142,7 +142,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const scalar& lhs, const column_view& rhs, const column_view& boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -150,7 +150,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const column_view& lhs, const scalar& rhs, const column_view boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -158,7 +158,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const scalar& lhs, const scalar& rhs, const column_view boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -166,7 +166,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const table_view& input, const table_view& target, const column_view& boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -174,14 +174,14 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil: const vector[reference_wrapper[constscalar]]& input, const table_view& target, const column_view& boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[scalar] get_element ( const column_view& input, size_type index, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd index a14932f8910..7db66dc1070 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd @@ -7,7 +7,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -27,7 +27,7 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] extract_datetime_component( const column_view& column, datetime_component component, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -42,54 +42,54 @@ cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil: cdef unique_ptr[column] ceil_datetimes( const column_view& column, rounding_frequency freq, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] floor_datetimes( const column_view& column, rounding_frequency freq, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] round_datetimes( const column_view& column, rounding_frequency freq, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const column_view& months, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] add_calendrical_months( const column_view& timestamps, const scalar& months, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef 
unique_ptr[column] day_of_year( const column_view& column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] is_leap_year( const column_view& column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] last_day_of_month( const column_view& column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] extract_quarter( const column_view& column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] days_in_month( const column_view& column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/detail/utilities/stream_pool.pxd b/python/pylibcudf/pylibcudf/libcudf/detail/utilities/stream_pool.pxd index 7aea4aafcd1..399a868db71 100644 --- a/python/pylibcudf/pylibcudf/libcudf/detail/utilities/stream_pool.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/detail/utilities/stream_pool.pxd @@ -1,14 +1,31 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 +from cuda.bindings.cyruntime cimport cudaStream_t from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.utilities.span cimport host_span -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +ctypedef const cudaStream_t const_cudaStream_t -cdef extern from "cudf/detail/utilities/stream_pool.hpp" namespace "cudf::detail" nogil: - cdef void join_streams( - host_span[const cuda_stream_view] streams, - cuda_stream_view stream +cdef extern from * nogil: + """ + #include <cudf/detail/utilities/stream_pool.hpp> + #include <cudf/utilities/span.hpp> + #include <rmm/cuda_stream_view.hpp> + #include <vector> + + namespace { + void join_streams_wrapper( + cudf::host_span<cudaStream_t const> streams, + cudaStream_t stream + ) { + std::vector<rmm::cuda_stream_view> stream_views(streams.begin(), streams.end()); + cudf::detail::join_streams(stream_views, stream); + } + } + """ + cdef void join_streams "join_streams_wrapper"( + host_span[const_cudaStream_t] streams, + cudaStream_t stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/distinct_count.pxd b/python/pylibcudf/pylibcudf/libcudf/distinct_count.pxd index 5707f34f578..2cbf79c0c17 100644 --- a/python/pylibcudf/pylibcudf/libcudf/distinct_count.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/distinct_count.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport ( null_policy, size_type, ) -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t cdef extern from "cudf/reduction/distinct_count.hpp" namespace "cudf" nogil: @@ -17,9 +17,9 @@ cdef extern from "cudf/reduction/distinct_count.hpp" namespace "cudf" nogil: column_view column, null_policy null_handling, nan_policy nan_handling, - cuda_stream_view stream) except +libcudf_exception_handler + cudaStream_t stream) except +libcudf_exception_handler cdef size_type distinct_count( table_view source_table, null_equality nulls_equal, - cuda_stream_view stream) except +libcudf_exception_handler + cudaStream_t stream) except +libcudf_exception_handler diff 
--git a/python/pylibcudf/pylibcudf/libcudf/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/filling.pxd index ac969cb8822..e9470a828a7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/filling.pxd @@ -12,7 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -22,7 +22,7 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: size_type begin, size_type end, const scalar & value, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -31,20 +31,20 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: size_type begin, size_type end, const scalar & value, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef unique_ptr[table] repeat( const table_view & input, const column_view & count, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[table] repeat( const table_view & input, size_type count, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -52,7 +52,7 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: size_type size, const scalar & init, const scalar & step, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -60,6 +60,6 @@ cdef extern from "cudf/filling.hpp" namespace "cudf" nogil: size_type n, const scalar& init, size_type months, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler 
diff --git a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd index 5ba69a12290..b5ba1031813 100644 --- a/python/pylibcudf/pylibcudf/libcudf/groupby.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd @@ -24,7 +24,7 @@ from pylibcudf.libcudf.types cimport ( sorted, ) -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref # workaround for https://github.com/cython/cython/issues/3885 @@ -67,7 +67,7 @@ cdef extern from "cudf/groupby.hpp" \ vector[aggregation_result] ] aggregate( const vector[aggregation_request]& requests, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -76,7 +76,7 @@ cdef extern from "cudf/groupby.hpp" \ vector[aggregation_result] ] scan( const vector[scan_request]& requests, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -87,19 +87,19 @@ cdef extern from "cudf/groupby.hpp" \ const table_view values, const vector[size_type] offset, const vector[reference_wrapper[constscalar]] fill_values, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler groups get_groups( table_view values, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler pair[unique_ptr[table], unique_ptr[table]] replace_nulls( const table_view& values, const vector[replace_policy] replace_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 380afc96c58..9610fa2a09f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -7,7 +7,7 @@ 
from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,64 +15,64 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: cdef unique_ptr[column] murmurhash3_x86_32( const table_view& input, const uint32_t seed, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[table] murmurhash3_x64_128( const table_view& input, const uint64_t seed, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] md5( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] sha1( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] sha224( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] sha256( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] sha384( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] sha512( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] xxhash_32( const table_view& input, const 
uint32_t seed, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd index b09524a257b..78fc455dd35 100644 --- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd @@ -12,7 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -40,13 +40,13 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ nogil: cdef unique_ptr[table] from_dlpack( const DLManagedTensor* managed_tensor, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler DLManagedTensor* to_dlpack( const table_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -65,18 +65,18 @@ cdef extern from "cudf/interop.hpp" namespace "cudf::interop" \ arrow_column( ArrowSchema&& schema, ArrowArray&& array, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler arrow_column( ArrowSchema&& schema, ArrowDeviceArray&& array, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler arrow_column( ArrowArrayStream&& stream, - cuda_stream_view cuda_stream, + cudaStream_t cuda_stream, device_async_resource_ref mr ) except +libcudf_exception_handler column_view view() except 
+libcudf_exception_handler @@ -84,13 +84,13 @@ cdef extern from "cudf/interop.hpp" namespace "cudf::interop" \ cdef cppclass arrow_table: arrow_table( ArrowArrayStream&& stream, - cuda_stream_view cuda_stream, + cudaStream_t cuda_stream, device_async_resource_ref mr ) except +libcudf_exception_handler arrow_table( ArrowSchema&& schema, ArrowDeviceArray&& array, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler table_view view() except +libcudf_exception_handler @@ -135,7 +135,7 @@ cdef extern from *: template <typename ViewType> ArrowArray* to_arrow_host_raw( ViewType const& obj, - rmm::cuda_stream_view stream, + cudaStream_t stream, rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { ArrowArray *arr = new ArrowArray(); auto device_arr = cudf::to_arrow_host(obj, stream, mr); @@ -175,7 +175,7 @@ cdef extern from *: ArrowDeviceArray* to_arrow_device_raw( ViewType const& obj, PyObject* owner, - rmm::cuda_stream_view stream = cudf::get_default_stream(), + cudaStream_t stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) { auto tmp = cudf::to_arrow_device(obj, stream, mr); @@ -222,11 +222,11 @@ cdef extern from *: ) except +libcudf_exception_handler nogil cdef ArrowArray* to_arrow_host_raw( const table_view& tbl, - cuda_stream_view stream, + cudaStream_t stream, ) except +libcudf_exception_handler nogil cdef ArrowArray* to_arrow_host_raw( const column_view& tbl, - cuda_stream_view stream, + cudaStream_t stream, ) except +libcudf_exception_handler nogil cdef void release_arrow_array_raw( ArrowArray * diff --git a/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd index ff84ad922fc..521147218bf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd @@ -5,7 +5,7 @@ from libcpp.string cimport string from libcpp.vector cimport vector from 
pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -49,6 +49,6 @@ cdef extern from "cudf/io/avro.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_avro( avro_reader_options &options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd index 31f626b7d9d..45987fbedcd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd @@ -10,7 +10,7 @@ from libcpp.string cimport string from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport data_type, size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/io/csv.hpp" \ @@ -263,7 +263,7 @@ cdef extern from "cudf/io/csv.hpp" \ cdef cudf_io_types.table_with_metadata read_csv( csv_reader_options &options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -337,7 +337,7 @@ cdef extern from "cudf/io/csv.hpp" \ cdef void write_csv( csv_writer_options args, - cuda_stream_view stream, + cudaStream_t stream, ) except +libcudf_exception_handler cdef bool is_supported_write_csv( diff --git a/python/pylibcudf/pylibcudf/libcudf/io/hybrid_scan.pxd b/python/pylibcudf/pylibcudf/libcudf/io/hybrid_scan.pxd index 9f7462f6b86..8578908fc43 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/hybrid_scan.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/hybrid_scan.pxd @@ -15,7 
+15,7 @@ from pylibcudf.libcudf.io.text cimport byte_range_info from pylibcudf.libcudf.io.types cimport table_with_metadata from pylibcudf.libcudf.types cimport size_type from pylibcudf.libcudf.utilities.span cimport device_span, host_span -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref ctypedef const uint8_t const_uint8_t @@ -61,7 +61,7 @@ cdef extern from "cudf/io/experimental/hybrid_scan.hpp" \ vector[size_type] filter_row_groups_with_stats( host_span[const_size_type] row_group_indices, const parquet_reader_options& options, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler pair[ @@ -75,20 +75,20 @@ cdef extern from "cudf/io/experimental/hybrid_scan.hpp" \ host_span[const_device_span_const_uint8_t] dictionary_page_data, host_span[const_size_type] row_group_indices, const parquet_reader_options& options, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler vector[size_type] filter_row_groups_with_bloom_filters( host_span[const_device_span_const_uint8_t] bloom_filter_data, host_span[const_size_type] row_group_indices, const parquet_reader_options& options, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler unique_ptr[column] build_row_mask_with_page_index_stats( host_span[const_size_type] row_group_indices, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -103,7 +103,7 @@ cdef extern from "cudf/io/experimental/hybrid_scan.hpp" \ mutable_column_view& row_mask, use_data_page_mask mask_data_pages, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -118,7 +118,7 @@ cdef extern from "cudf/io/experimental/hybrid_scan.hpp" \ const 
column_view& row_mask, use_data_page_mask mask_data_pages, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -131,7 +131,7 @@ cdef extern from "cudf/io/experimental/hybrid_scan.hpp" \ host_span[const_size_type] row_group_indices, host_span[const_device_span_const_uint8_t] column_chunk_data, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -143,7 +143,7 @@ cdef extern from "cudf/io/experimental/hybrid_scan.hpp" \ use_data_page_mask mask_data_pages, host_span[const_device_span_const_uint8_t] column_chunk_data, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -159,7 +159,7 @@ cdef extern from "cudf/io/experimental/hybrid_scan.hpp" \ use_data_page_mask mask_data_pages, host_span[const_device_span_const_uint8_t] column_chunk_data, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd index 6d5a506d18a..af3b1e59bd1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd @@ -11,7 +11,7 @@ from libcpp.string cimport string from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport data_type, size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -158,7 +158,7 @@ cdef extern from "cudf/io/json.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_json( 
json_reader_options &options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -240,7 +240,7 @@ cdef extern from "cudf/io/json.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata write_json( json_writer_options &options, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef bool is_supported_write_json( diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd index 0455c0fa1b1..bea5c1e06f0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd @@ -11,7 +11,7 @@ from libcpp.string cimport string from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.types cimport data_type, size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -80,7 +80,7 @@ cdef extern from "cudf/io/orc.hpp" namespace "cudf::io" nogil: cdef cudf_io_types.table_with_metadata read_orc( orc_reader_options opts, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr, ) except +libcudf_exception_handler @@ -150,7 +150,7 @@ cdef extern from "cudf/io/orc.hpp" namespace "cudf::io" nogil: cdef void write_orc( orc_writer_options options, - cuda_stream_view stream, + cudaStream_t stream, ) except +libcudf_exception_handler cdef bool is_supported_read_orc( @@ -228,7 +228,7 @@ cdef extern from "cudf/io/orc.hpp" namespace "cudf::io" nogil: orc_chunked_writer() except +libcudf_exception_handler orc_chunked_writer( chunked_orc_writer_options args, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler orc_chunked_writer& write( cudf_table_view.table_view table_, diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd 
b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd index e0c67e14e1d..f365a45b34a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t from libcpp cimport bool @@ -8,7 +8,7 @@ from libcpp.vector cimport vector from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.io cimport types as cudf_io_types from pylibcudf.variant cimport monostate, variant -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t cdef extern from "cudf/io/orc_metadata.hpp" \ @@ -71,5 +71,5 @@ cdef extern from "cudf/io/orc_metadata.hpp" \ cdef parsed_orc_statistics read_parsed_orc_statistics( const cudf_io_types.source_info& src_info, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd index dc0dff818a3..00b62e55514 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd @@ -22,7 +22,7 @@ from pylibcudf.libcudf.io.types cimport ( ) from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type, size_type, type_id -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -124,7 +124,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef table_with_metadata read_parquet( parquet_reader_options args, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except 
+libcudf_exception_handler @@ -256,7 +256,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: cdef unique_ptr[vector[uint8_t]] write_parquet( parquet_writer_options options, - cuda_stream_view stream, + cudaStream_t stream, ) except +libcudf_exception_handler cdef bool is_supported_read_parquet( @@ -288,7 +288,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_writer() except +libcudf_exception_handler chunked_parquet_writer( const chunked_parquet_writer_options& args, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler chunked_parquet_writer& write( const table_view& table_, @@ -303,14 +303,14 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil: chunked_parquet_reader( size_t chunk_read_limit, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler chunked_parquet_reader( size_t chunk_read_limit, size_t pass_read_limit, const parquet_reader_options& options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler bool has_next() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/text.pxd b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd index 77552a80cfd..7152e5d0afb 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/text.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd @@ -6,7 +6,7 @@ from libcpp.memory cimport unique_ptr from libcpp.string cimport string from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -63,6 +63,6 @@ cdef extern from "cudf/io/text/multibyte_split.hpp" \ data_chunk_source source, string delimiter, 
parse_options options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd index 557e8856b28..45cfb4f15da 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd @@ -6,7 +6,7 @@ from libcpp.optional cimport optional from libcpp.string cimport string from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -14,6 +14,6 @@ cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil: unique_ptr[table] make_timezone_transition_table( optional[string] tzif_dir, string timezone_name, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 06a7d497ad5..d13bf245119 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -13,7 +13,7 @@ from pylibcudf.libcudf.expressions cimport expression from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref from rmm.librmm.device_uvector cimport device_uvector @@ -28,7 +28,7 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - cuda_stream_view stream, + cudaStream_t 
stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -36,7 +36,7 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -44,7 +44,7 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -52,7 +52,7 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -60,7 +60,7 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -68,14 +68,14 @@ cdef extern from "cudf/join/join.hpp" namespace "cudf" nogil: const table_view left_keys, const table_view right_keys, null_equality nulls_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[table] cross_join( const table_view left, const table_view right, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -84,7 +84,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view left, const table_view right, const expression binary_predicate, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -93,7 +93,7 @@ cdef extern from 
"cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view right, const expression binary_predicate, optional[size_t] output_size, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -101,7 +101,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view left, const table_view right, const expression binary_predicate, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -110,7 +110,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view right, const expression binary_predicate, optional[size_t] output_size, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -118,7 +118,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view left, const table_view right, const expression binary_predicate, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -126,7 +126,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view left, const table_view right, const expression binary_predicate, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -135,7 +135,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view right, const expression binary_predicate, optional[size_t] output_size, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -143,7 +143,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view left, const table_view right, const expression binary_predicate, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except 
+libcudf_exception_handler @@ -152,7 +152,7 @@ cdef extern from "cudf/join/conditional_join.hpp" namespace "cudf" nogil: const table_view right, const expression binary_predicate, optional[size_t] output_size, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -165,7 +165,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const expression binary_predicate, null_equality compare_nulls, output_size_data_type output_size_data, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -177,7 +177,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const expression binary_predicate, null_equality compare_nulls, output_size_data_type output_size_data, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -189,7 +189,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const expression binary_predicate, null_equality compare_nulls, output_size_data_type output_size_data, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -200,7 +200,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view right_conditional, const expression binary_predicate, null_equality compare_nulls, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -211,7 +211,7 @@ cdef extern from "cudf/join/mixed_join.hpp" namespace "cudf" nogil: const table_view right_conditional, const expression binary_predicate, null_equality compare_nulls, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -225,21 +225,21 @@ cdef extern from "cudf/join/filtered_join.hpp" namespace "cudf" nogil: filtered_join( const table_view build, null_equality 
compare_nulls, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler filtered_join( const table_view build, null_equality compare_nulls, double load_factor, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler gather_map_type semi_join( const table_view probe, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler gather_map_type anti_join( const table_view probe, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/json.pxd b/python/pylibcudf/pylibcudf/libcudf/json.pxd index 39899490cac..bb606b86b33 100644 --- a/python/pylibcudf/pylibcudf/libcudf/json.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/json.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -30,6 +30,6 @@ cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil: column_view col, string_scalar json_path, get_json_object_options options, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd index ad9611511dd..0b2c1651714 100644 --- a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view 
+from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -21,6 +21,6 @@ cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil: inclusive left_inclusive, const column_view &right_edges, inclusive right_inclusive, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd index 66e90dcd66a..310d166df59 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd @@ -7,7 +7,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -21,19 +21,19 @@ cdef extern from "cudf/lists/combine.hpp" namespace \ cdef unique_ptr[column] concatenate_rows( const table_view input_table, concatenate_null_policy null_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] concatenate_list_elements( const table_view input_table, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] concatenate_list_elements( const column_view input_table, concatenate_null_policy null_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd index efb2d760366..3736e42b32d 
100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -20,20 +20,20 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] contains( const lists_column_view& lists, const scalar& search_key, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] contains( const lists_column_view& lists, const column_view& search_keys, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] contains_nulls( const lists_column_view& lists, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -41,7 +41,7 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: const lists_column_view& lists, const scalar& search_key, duplicate_find_option find_option, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -49,6 +49,6 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil: const lists_column_view& lists, const column_view& search_keys, duplicate_find_option find_option, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd 
index 6203bafdc38..6fa64c8b291 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd @@ -4,13 +4,13 @@ from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] count_elements( const lists_column_view&, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd index b31d3a7cdca..fa15fb1eeef 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -13,6 +13,6 @@ cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil: cdef unique_ptr[table] explode_outer( const table_view, size_type explode_column_idx, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd index c82a9029311..66a07f41e38 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column, column_view from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -13,12 +13,12 @@ cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] extract_list_element( const lists_column_view&, size_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] extract_list_element( const lists_column_view&, const column_view&, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd index 11cc19b86f9..1e55916d299 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -12,7 +12,7 @@ cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] sequences( const column_view& starts, const column_view& sizes, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref 
mr ) except +libcudf_exception_handler @@ -20,6 +20,6 @@ cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil: const column_view& starts, const column_view& steps, const column_view& sizes, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd index bae67a96b0d..b7212bea51e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.copying cimport out_of_bounds_policy from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: @@ -13,6 +13,6 @@ cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil: const lists_column_view& source_column, const lists_column_view& gather_map_list, out_of_bounds_policy bounds_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd index fe1630c1728..69a6c80f242 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport ( @@ -26,7 +26,7 @@ cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil: column_view offsets() except +libcudf_exception_handler column_view child() except +libcudf_exception_handler column_view get_sliced_child( - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef enum: diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd index f831024ec82..e60c8acbb38 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd @@ -4,13 +4,13 @@ from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil: cdef unique_ptr[column] reverse( const lists_column_view& lists_column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd index 5e02d11d95a..b56caa9adb5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column 
from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.types cimport nan_equality, null_equality -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,7 +15,7 @@ cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -24,7 +24,7 @@ cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -33,7 +33,7 @@ cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -42,6 +42,6 @@ cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil: const lists_column_view& rhs, null_equality nulls_equal, nan_equality nans_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd index 4036ccec6c5..9899591d6d1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from 
pylibcudf.libcudf.types cimport null_order, order -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -14,7 +14,7 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: const lists_column_view source_column, order column_order, null_order null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -22,6 +22,6 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil: const lists_column_view source_column, order column_order, null_order null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd index dec32027402..0187642e0c7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option from pylibcudf.libcudf.types cimport nan_equality, null_equality -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,7 +15,7 @@ cdef extern from "cudf/lists/stream_compaction.hpp" \ cdef unique_ptr[column] apply_boolean_mask( const lists_column_view& lists_column, const lists_column_view& boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -24,6 +24,6 @@ cdef extern from "cudf/lists/stream_compaction.hpp" \ null_equality nulls_equal, 
nan_equality nans_equal, duplicate_keep_option keep_option, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/merge.pxd b/python/pylibcudf/pylibcudf/libcudf/merge.pxd index 860e4263c1c..f4389ac991a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/merge.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/merge.pxd @@ -7,7 +7,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,6 +17,6 @@ cdef extern from "cudf/merge.hpp" namespace "cudf" nogil: vector[libcudf_types.size_type] key_cols, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd index 1b1b3001981..330c69f0579 100644 --- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd @@ -8,14 +8,14 @@ from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type from rmm.librmm.device_buffer cimport device_buffer -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: cdef device_buffer copy_bitmask ( column_view view, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -23,7 +23,7 @@ 
cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: const bitmask_type* null_mask, size_type begin_bit, size_type end_bit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -35,19 +35,19 @@ cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: cdef device_buffer create_null_mask ( size_type size, mask_state state, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef pair[device_buffer, size_type] bitmask_and( table_view view, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) cdef pair[device_buffer, size_type] bitmask_or( table_view view, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) @@ -55,12 +55,12 @@ cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil: const bitmask_type * bitmask, size_type start, size_type stop, - cuda_stream_view stream + cudaStream_t stream ) cdef size_type index_of_first_set_bit( const bitmask_type * bitmask, size_type start, size_type stop, - cuda_stream_view stream + cudaStream_t stream ) diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd index eca30faa630..94a7fe3db9d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,7 +17,7 @@ cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: 
cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( const column_view &merge_pairs, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -25,6 +25,6 @@ cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: const column_view &strings, const bpe_merge_pairs &merge_pairs, const string_scalar &separator, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/deduplicate.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/deduplicate.pxd index 26e39c963d2..82a8581ea0a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/deduplicate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/deduplicate.pxd @@ -7,7 +7,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref from rmm.librmm.device_uvector cimport device_uvector @@ -19,7 +19,7 @@ cdef extern from "nvtext/deduplicate.hpp" namespace "nvtext" nogil: cdef suffix_array_type build_suffix_array( column_view source_strings, size_type min_width, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -27,7 +27,7 @@ cdef extern from "nvtext/deduplicate.hpp" namespace "nvtext" nogil: column_view source_strings, column_view indices, size_type min_width, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -37,6 +37,6 @@ cdef extern from "nvtext/deduplicate.hpp" namespace "nvtext" nogil: column_view input2, column_view indices2, size_type min_width, - 
cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd index b7f3e97a4b0..f3c10c11abf 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,12 +15,12 @@ cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] edit_distance( const column_view & strings, const column_view & targets, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] edit_distance_matrix( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd index 43619d356f6..3d97aaf93b1 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport 
device_async_resource_ref @@ -17,14 +17,14 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: const column_view &strings, size_type ngrams, const string_scalar & separator, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] generate_character_ngrams( const column_view &strings, size_type ngrams, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -32,6 +32,6 @@ cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil: const column_view &strings, size_type ngrams, uint32_t seed, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd index de45913fbb5..0a3ba52a3d5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,6 +15,6 @@ cdef extern from "nvtext/jaccard.hpp" namespace "nvtext" nogil: const column_view &input1, const column_view &input2, size_type width, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd index eaf0b8c63b1..94083fbafd3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd +++ 
b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -19,7 +19,7 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &a, const column_view &b, const size_type width, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -29,7 +29,7 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const column_view &a, const column_view &b, const size_type width, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -39,7 +39,7 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const uint32_t seed, const column_view &a, const column_view &b, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -49,6 +49,6 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil: const uint64_t seed, const column_view &a, const column_view &b, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd index 41d153b99a0..6e4cc18e17f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from 
pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,6 +17,6 @@ cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil: size_type ngrams, const string_scalar & delimiter, const string_scalar & separator, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd index 25678d12091..0184c1d8785 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd @@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -13,7 +13,7 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] normalize_spaces( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -23,13 +23,13 @@ cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil: cdef unique_ptr[character_normalizer] create_character_normalizer( bool do_lower_case, const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] normalize_characters( const column_view & strings, const character_normalizer & normalizer, - cuda_stream_view stream, + 
cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd index d14ce40b168..628181b3f89 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,7 +17,7 @@ cdef extern from "nvtext/replace.hpp" namespace "nvtext" nogil: const column_view & targets, const column_view & replacements, const string_scalar & delimiter, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -26,6 +26,6 @@ cdef extern from "nvtext/replace.hpp" namespace "nvtext" nogil: size_type min_token_length, const string_scalar & replacement, const string_scalar & delimiter, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd index e6e2866008b..2088440749a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd @@ -7,7 +7,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t 
from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -18,7 +18,7 @@ cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] porter_stemmer_measure( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -26,12 +26,12 @@ cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil: column_view source_strings, letter_type ltype, size_type character_index, - cuda_stream_view stream) except +libcudf_exception_handler + cudaStream_t stream) except +libcudf_exception_handler cdef unique_ptr[column] is_letter( column_view source_strings, letter_type ltype, column_view indices, - cuda_stream_view stream) except +libcudf_exception_handler + cudaStream_t stream) except +libcudf_exception_handler ctypedef int32_t underlying_type_t_letter_type diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd index 3b7ae2e9b6f..1c6eccb0476 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,34 +15,34 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[column] tokenize( const column_view & strings, const string_scalar & delimiter, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] tokenize( const column_view & strings, const column_view & delimiters, - cuda_stream_view stream, 
+ cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] count_tokens( const column_view & strings, const string_scalar & delimiter, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] count_tokens( const column_view & strings, const column_view & delimiters, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] character_tokenize( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -50,7 +50,7 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: const column_view & strings, const column_view & row_indices, const string_scalar & separator, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -59,7 +59,7 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[tokenize_vocabulary] load_vocabulary( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -68,6 +68,6 @@ cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil: const tokenize_vocabulary & vocabulary, const string_scalar & delimiter, size_type default_id, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd index a4bcde47f80..0c43f0d21ff 100644 --- a/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/wordpiece_tokenize.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from 
pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -16,7 +16,7 @@ cdef extern from "nvtext/wordpiece_tokenize.hpp" namespace "nvtext" nogil: cdef unique_ptr[wordpiece_vocabulary] load_wordpiece_vocabulary( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -24,6 +24,6 @@ cdef extern from "nvtext/wordpiece_tokenize.hpp" namespace "nvtext" nogil: const column_view & strings, const wordpiece_vocabulary & vocabulary, size_type max_tokens_per_row, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index e7c0f496de8..2e0c978f77d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.hash cimport DEFAULT_HASH_SEED from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: @@ -28,7 +28,7 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: int num_partitions, hash_id hash_function, uint32_t seed, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -39,7 +39,7 @@ cdef extern from 
"cudf/partitioning.hpp" namespace "cudf" nogil: int num_partitions, hash_id hash_function, uint32_t seed, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -48,7 +48,7 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: const table_view& t, const column_view& partition_map, int num_partitions, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -57,6 +57,6 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: const table_view& input, int num_partitions, int start_partition, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd index 823bd34e4a7..8bc636da998 100644 --- a/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd @@ -15,7 +15,7 @@ from pylibcudf.libcudf.types cimport ( order_info, sorted, ) -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -27,7 +27,7 @@ cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil: interpolation interp, column_view ordered_indices, bool exact, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -38,6 +38,6 @@ cdef extern from "cudf/quantiles.hpp" namespace "cudf" nogil: sorted is_input_sorted, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/reduce.pxd b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd index 9da4159d0c1..5fb383149a7 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/reduce.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.types cimport data_type, null_policy -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref ctypedef const scalar constscalar @@ -22,7 +22,7 @@ cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: const reduce_aggregation& agg, data_type output_type, optional[reference_wrapper[constscalar]] init, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -35,13 +35,13 @@ cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: const scan_aggregation& agg, scan_type inclusive, null_policy null_handling, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef pair[unique_ptr[scalar], unique_ptr[scalar]] minmax( const column_view& col, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/replace.pxd index 35078b64ee3..4821a13924c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/replace.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column_view cimport ( mutable_column_view, ) from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -22,47 +22,47 @@ cdef extern from "cudf/replace.hpp" namespace "cudf" nogil: cdef unique_ptr[column] 
replace_nulls( column_view source_column, column_view replacement_column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] replace_nulls( column_view source_column, scalar replacement, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] replace_nulls( column_view source_column, replace_policy replace_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] find_and_replace_all( column_view source_column, column_view values_to_replace, column_view replacement_values, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] clamp( column_view source_column, scalar lo, scalar lo_replace, scalar hi, scalar hi_replace, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] clamp( column_view source_column, scalar lo, scalar hi, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] normalize_nans_and_zeros( column_view source_column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef void normalize_nans_and_zeros( mutable_column_view source_column, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/reshape.pxd b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd index 598e148d643..beda4ec09fc 100644 --- a/python/pylibcudf/pylibcudf/libcudf/reshape.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.table.table_view cimport table_view from 
pylibcudf.libcudf.types cimport size_type, data_type from pylibcudf.libcudf.utilities.span cimport device_span -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cuda/functional" namespace "cuda::std": @@ -19,17 +19,17 @@ cdef extern from "cuda/functional" namespace "cuda::std": cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil: cdef unique_ptr[column] interleave_columns( table_view source_table, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[table] tile( table_view source_table, size_type count, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef void table_to_array( table_view input_table, device_span[byte] output, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/rolling.pxd b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd index 6ea400f92d3..69cdbd6f396 100644 --- a/python/pylibcudf/pylibcudf/libcudf/rolling.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd @@ -12,7 +12,7 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type, null_order, order, size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -44,7 +44,7 @@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: range_window_type preceding, range_window_type following, vector[rolling_request]& requests, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -54,7 +54,7 
@@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: column_view following_window, size_type min_periods, rolling_aggregation& agg, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] rolling_window( @@ -63,7 +63,7 @@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: size_type following_window, size_type min_periods, rolling_aggregation& agg, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef pair[unique_ptr[column], unique_ptr[column]] make_range_windows( @@ -73,7 +73,7 @@ cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil: null_order null_order, range_window_type preceding, range_window_type following, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/round.pxd b/python/pylibcudf/pylibcudf/libcudf/round.pxd index 39965d025c6..f21987844f3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/round.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/round.pxd @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -20,7 +20,7 @@ cdef extern from "cudf/round.hpp" namespace "cudf" nogil: const column_view& input, int32_t decimal_places, rounding_method method, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -28,6 +28,6 @@ cdef extern from "cudf/round.hpp" namespace "cudf" nogil: const column_view& input, int32_t decimal_places, rounding_method method, - cuda_stream_view stream, + cudaStream_t stream, 
device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd index 6c3dc71e019..10d3a42c572 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -18,31 +18,31 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: scalar(scalar other) except +libcudf_exception_handler data_type type() except +libcudf_exception_handler void set_valid_async( - bool is_valid, cuda_stream_view stream + bool is_valid, cudaStream_t stream ) except +libcudf_exception_handler - bool is_valid(cuda_stream_view stream) except +libcudf_exception_handler + bool is_valid(cudaStream_t stream) except +libcudf_exception_handler cdef cppclass numeric_scalar[T](scalar): void set_value( T value, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler - T value(cuda_stream_view stream) except +libcudf_exception_handler + T value(cudaStream_t stream) except +libcudf_exception_handler cdef cppclass timestamp_scalar[T](scalar): void set_value( T value, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef cppclass duration_scalar[T](scalar): void set_value( T value, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef cppclass string_scalar(scalar): - string to_string(cuda_stream_view stream) except +libcudf_exception_handler + string to_string(cudaStream_t stream) except 
+libcudf_exception_handler cdef cppclass list_scalar(scalar): pass @@ -57,4 +57,4 @@ cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil: scale_type scale, bool is_valid ) except +libcudf_exception_handler - T value(cuda_stream_view stream) except +libcudf_exception_handler + T value(cudaStream_t stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd index 6034b2ecc08..6b1329962cd 100644 --- a/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd @@ -9,49 +9,49 @@ from pylibcudf.libcudf.scalar.scalar cimport scalar from pylibcudf.libcudf.fixed_point.fixed_point cimport scale_type from pylibcudf.libcudf.types cimport int128 as int128_t -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil: cdef unique_ptr[scalar] make_string_scalar( const string & _string, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_fixed_width_scalar[T]( T value, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_fixed_point_scalar[T]( int128_t value, scale_type scale, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_numeric_scalar( data_type type_, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_timestamp_scalar( data_type type_, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) 
except +libcudf_exception_handler cdef unique_ptr[scalar] make_empty_scalar_like( const column_view &, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_duration_scalar( data_type type_, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[scalar] make_default_constructed_scalar( data_type type_, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/search.pxd b/python/pylibcudf/pylibcudf/libcudf/search.pxd index b369ec05392..c1e41893d2e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/search.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/search.pxd @@ -7,7 +7,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -18,7 +18,7 @@ cdef extern from "cudf/search.hpp" namespace "cudf" nogil: table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -27,13 +27,13 @@ cdef extern from "cudf/search.hpp" namespace "cudf" nogil: table_view needles, vector[libcudf_types.order] column_order, vector[libcudf_types.null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] contains( column_view haystack, column_view needles, - cuda_stream_view stream, + cudaStream_t 
stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd index 97822e2c374..c8e252ced2c 100644 --- a/python/pylibcudf/pylibcudf/libcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd @@ -17,7 +17,7 @@ from pylibcudf.libcudf.types cimport ( null_order, size_type ) -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -26,7 +26,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: table_view source_table, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -34,7 +34,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: table_view source_table, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -45,7 +45,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: null_policy null_handling, null_order null_precedence, bool percentage, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -53,7 +53,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const table_view& table, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler cdef unique_ptr[table] segmented_sort_by_key( @@ -62,7 +62,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const column_view& segment_offsets, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except 
+libcudf_exception_handler @@ -72,7 +72,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const column_view& segment_offsets, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -81,7 +81,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const table_view& keys, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -90,7 +90,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const table_view& keys, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -98,7 +98,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: table_view source_table, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -106,7 +106,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: table_view source_table, vector[order] column_order, vector[null_order] null_precedence, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -114,7 +114,7 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const column_view& col, size_type k, order sort_order, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -122,6 +122,6 @@ cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil: const column_view& col, size_type k, order sort_order, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git 
a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd index 0358aa4068c..9f8686da472 100644 --- a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd @@ -14,7 +14,7 @@ from pylibcudf.libcudf.types cimport ( null_equality, size_type, ) -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -29,7 +29,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: table_view source_table, vector[size_type] keys, size_type keep_threshold, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -37,14 +37,14 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: table_view source_table, vector[size_type] keys, size_type keep_threshold, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[table] apply_boolean_mask( table_view source_table, column_view boolean_mask, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -53,7 +53,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: vector[size_type] keys, duplicate_keep_option keep, null_equality nulls_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -63,7 +63,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equals, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -72,7 +72,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, 
null_equality nulls_equal, nan_equality nans_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -82,7 +82,7 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -90,6 +90,6 @@ cdef extern from "cudf/stream_compaction.hpp" namespace "cudf" nogil: table_view predicate_table, const expression& predicate_expr, table_view filter_table, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd index 06e95c95870..0cee9e43346 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd @@ -4,7 +4,7 @@ from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -12,15 +12,15 @@ cdef extern from "cudf/strings/attributes.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] count_characters( column_view source_strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] count_bytes( column_view source_strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] code_points( column_view source_strings, - cuda_stream_view 
stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd index b615cd984db..7b8ac094311 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.strings.char_types cimport string_character_types -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -14,18 +14,18 @@ cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] capitalize( const column_view & strings, const string_scalar & delimiters, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] title( const column_view & strings, string_character_types sequence_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] is_title( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd index 463586d9f37..a056f1b4737 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd @@ -4,22 +4,22 @@ from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from 
pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] to_lower( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] to_upper( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] swapcase( const column_view & strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd index 7706498eceb..c6af0fb73d2 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -29,7 +29,7 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ column_view source_strings, string_character_types types, string_character_types verify_types, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] filter_characters_of_type( @@ -37,5 +37,5 @@ cdef extern from "cudf/strings/char_types/char_types.hpp" \ 
string_character_types types_to_remove, string_scalar replacement, string_character_types types_to_keep, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd index ef831d3b167..2e2b6656797 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -27,7 +27,7 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator, string_scalar narep, separator_on_nulls separate_nulls, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] concatenate( @@ -36,14 +36,14 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar separator_narep, string_scalar col_narep, separator_on_nulls separate_nulls, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] join_strings( column_view input, string_scalar separator, string_scalar narep, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] join_list_elements( @@ -53,7 +53,7 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar string_narep, separator_on_nulls separate_nulls, output_if_empty_list 
empty_list_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] join_list_elements( @@ -62,5 +62,5 @@ cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil: string_scalar narep, separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd index f60782e93b7..cc9a7c6835d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.strings.regex_program cimport regex_program -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -16,31 +16,31 @@ cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains_re( column_view source_strings, regex_program, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] count_re( column_view source_strings, regex_program, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] matches_re( column_view source_strings, regex_program, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] like( column_view source_strings, string pattern, string escape_character, - cuda_stream_view 
stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] like( column_view source_strings, column_view patterns, string_scalar escape_character, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd index b5b837878f9..8875bc62ed5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,12 +15,12 @@ cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \ cdef unique_ptr[column] to_booleans( column_view input, string_scalar true_string, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] from_booleans( column_view booleans, string_scalar true_string, string_scalar false_string, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd index 5779839a685..92983f9dc49 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from 
pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,18 +17,18 @@ cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \ column_view input, data_type timestamp_type, string format, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] from_timestamps( column_view timestamps, string format, column_view names, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] is_timestamp( column_view input_col, string format, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd index 2eae8b987b9..4f22b715ef9 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,11 +17,11 @@ cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \ const column_view & input, data_type duration_type, const string & format, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] from_durations( const 
column_view & durations, const string & format, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd index e5f512c331f..8aaa0ebf4c7 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,17 +15,17 @@ cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \ cdef unique_ptr[column] to_fixed_point( column_view input, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] from_fixed_point( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] is_fixed_point( column_view input, data_type decimal_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd index 4ea1cd527f4..5a111c1979d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view 
cimport column_view from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,16 +15,16 @@ cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \ cdef unique_ptr[column] to_floats( column_view strings, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] from_floats( column_view floats, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] is_float( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd index 306c4b66758..4d3f4ff758a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,40 +15,40 @@ cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \ cdef unique_ptr[column] to_integers( column_view input, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] from_integers( column_view integers, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) 
except +libcudf_exception_handler cdef unique_ptr[column] is_integer( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] is_integer( column_view input, data_type int_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] hex_to_integers( column_view input, data_type output_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] is_hex( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_hex( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd index d12f3992d85..00a64787957 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -13,16 +13,16 @@ cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] ipv4_to_integers( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] integers_to_ipv4( column_view integers, - 
cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] is_ipv4( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd index 8ed381e87da..bfae49bae4b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,5 +17,5 @@ cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \ column_view input, string_scalar na_rep, column_view separators, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd index b20c03f976b..db2d4f4efc0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -13,10 +13,10 @@ cdef extern from 
"cudf/strings/convert/convert_urls.hpp" namespace \ "cudf::strings" nogil: cdef unique_ptr[column] url_encode( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] url_decode( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd index 845de206dbf..d3e0d0fd35a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -16,18 +16,18 @@ cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[table] extract( column_view input, regex_program prog, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] extract_all_record( column_view input, regex_program prog, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] extract_single( column_view input, regex_program prog, size_type group, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd index b8934aeb7fe..42752152de8 100644 --- 
a/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -16,37 +16,37 @@ cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] contains( column_view source_strings, string_scalar target, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] contains( column_view source_strings, column_view target_strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] ends_with( column_view source_strings, string_scalar target, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] ends_with( column_view source_strings, column_view target_strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] starts_with( column_view source_strings, string_scalar target, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] starts_with( column_view source_strings, column_view target_strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] find( @@ -54,14 +54,14 @@ cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil: string_scalar target, size_type start, 
size_type stop, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] find( column_view source_strings, column_view target, size_type start, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] rfind( @@ -69,5 +69,5 @@ cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil: string_scalar target, size_type start, size_type stop, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd index da751990053..1e42a476c13 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.table.table cimport table -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -15,11 +15,11 @@ cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \ cdef unique_ptr[table] contains_multiple( column_view input, column_view targets, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] find_multiple( column_view input, column_view targets, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd index 
02ecbef7095..d72ffd09d8e 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.strings.regex_program cimport regex_program -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -14,11 +14,11 @@ cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] findall( column_view input, regex_program prog, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] find_re( column_view input, regex_program prog, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd index 5e3e5c43f61..8b291a22a05 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.strings.side_type cimport side_type from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -20,17 +20,17 @@ cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil: size_type width, side_type side, string fill_char, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) 
except +libcudf_exception_handler cdef unique_ptr[column] zfill( column_view input, size_type width, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] zfill_by_widths( column_view input, column_view widths, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd index 05a2954af35..86519de0b90 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -16,13 +16,13 @@ cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \ cdef unique_ptr[column] repeat_strings( column_view input, size_type repeat_times, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] repeat_strings( column_view input, column_view repeat_times, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd index 263b91475b8..cf2573af5ed 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd @@ -8,7 +8,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport 
string_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -18,7 +18,7 @@ cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil: string_scalar repl, size_type start, size_type stop, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] replace( @@ -26,12 +26,12 @@ cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil: string_scalar target, string_scalar repl, int32_t maxrepl, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] replace_multiple( column_view source_strings, column_view target_strings, column_view repl_strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd index 5f5cbaeaf55..d3e958841ab 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.strings.regex_flags cimport regex_flags from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -22,7 +22,7 @@ cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil: regex_program prog, string_scalar replacement, size_type max_replace_count, - cuda_stream_view stream, + cudaStream_t stream, 
device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] replace_re( @@ -30,12 +30,12 @@ cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil: vector[string] patterns, column_view replacements, regex_flags flags, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] replace_with_backrefs( column_view input, regex_program prog, string replacement, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/reverse.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/reverse.pxd index 6e6fc2acac4..39a3ac4b769 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/reverse.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/reverse.pxd @@ -4,12 +4,12 @@ from libcpp.memory cimport unique_ptr from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/strings/reverse.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] reverse( column_view source_strings, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd index 0c99455ea33..6c9031482ca 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd @@ -7,7 +7,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport 
column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.table.table cimport table -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,11 +17,11 @@ cdef extern from "cudf/strings/split/partition.hpp" namespace \ cdef unique_ptr[table] partition( column_view input, string_scalar delimiter, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[table] rpartition( column_view input, string_scalar delimiter, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd index 9ed741b608a..5d14fefdb1b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.strings.regex_program cimport regex_program from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -20,35 +20,35 @@ cdef extern from "cudf/strings/split/split.hpp" namespace \ column_view strings_column, string_scalar delimiter, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[table] rsplit( column_view strings_column, string_scalar delimiter, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] 
split_record( column_view strings, string_scalar delimiter, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] rsplit_record( column_view strings, string_scalar delimiter, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] split_part( column_view strings, string_scalar delimiter, size_type index, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler @@ -59,26 +59,26 @@ cdef extern from "cudf/strings/split/split_re.hpp" namespace \ const column_view& input, regex_program prog, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[table] rsplit_re( const column_view& input, regex_program prog, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] split_record_re( const column_view& input, regex_program prog, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef unique_ptr[column] rsplit_record_re( const column_view& input, regex_program prog, size_type maxsplit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strings_column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strings_column_view.pxd index 8c72fed7219..5fa0dfb4289 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strings_column_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strings_column_view.pxd @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. 
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport int64_t from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport column_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t cdef extern from "cudf/strings/strings_column_view.hpp" namespace "cudf" nogil: cdef cppclass strings_column_view: strings_column_view(column_view) except +libcudf_exception_handler - int64_t chars_size(cuda_stream_view) except +libcudf_exception_handler + int64_t chars_size(cudaStream_t) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd index 13e017c33f7..4d56b2de5d3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd @@ -6,7 +6,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.strings.side_type cimport side_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -16,5 +16,5 @@ cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil: column_view input, side_type side, string_scalar to_strip, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd index 21c2fe4a77b..d0b4f192307 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd @@ -7,7 +7,7 @@ from 
pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -17,7 +17,7 @@ cdef extern from "cudf/strings/slice.hpp" namespace "cudf::strings" nogil: numeric_scalar[size_type] start, numeric_scalar[size_type] end, numeric_scalar[size_type] step, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -25,6 +25,6 @@ cdef extern from "cudf/strings/slice.hpp" namespace "cudf::strings" nogil: column_view source_strings, column_view starts, column_view stops, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd index 9bdc0489a89..dcf5aa20948 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.scalar.scalar cimport string_scalar from pylibcudf.libcudf.types cimport char_utf8 -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -18,7 +18,7 @@ cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] translate( column_view input, vector[pair[char_utf8, char_utf8]] chars_table, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -31,5 +31,5 @@ cdef extern from 
"cudf/strings/translate.hpp" namespace "cudf::strings" nogil: vector[pair[char_utf8, char_utf8]] characters_to_filter, filter_type keep_characters, string_scalar replacement, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd index 8aa5631a12e..2ddd924df48 100644 --- a/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd @@ -5,7 +5,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -14,5 +14,5 @@ cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil: cdef unique_ptr[column] wrap( column_view input, size_type width, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/structs/structs_column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/structs/structs_column_view.pxd index 7b339782295..d51a51dfb13 100644 --- a/python/pylibcudf/pylibcudf/libcudf/structs/structs_column_view.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/structs/structs_column_view.pxd @@ -1,6 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column_view cimport column_view @@ -22,5 +22,5 @@ cdef extern from "cudf/structs/structs_column_view.hpp" namespace "cudf" nogil: column_view parent() except +libcudf_exception_handler column_view get_sliced_child( size_type index, - cuda_stream_view stream + cudaStream_t stream ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/table/table.pxd b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd index 230131d5520..dcfc046a904 100644 --- a/python/pylibcudf/pylibcudf/libcudf/table/table.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table_view cimport mutable_table_view, table_view from pylibcudf.libcudf.types cimport size_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -14,12 +14,12 @@ cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil: cdef cppclass table: table( const table&, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler table( table_view, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler size_type num_columns() except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd index 9b2ace2d940..ebc9d8bfa1d 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd @@ -16,14 +16,14 @@ from 
pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type from pylibcudf.libcudf.types cimport null_aware, output_nullability from rmm.librmm.device_buffer cimport device_buffer -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: cdef pair[unique_ptr[device_buffer], size_type] bools_to_mask ( const column_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -31,19 +31,19 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: const bitmask_type* bitmask, size_type begin_bit, size_type end_bit, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef pair[unique_ptr[device_buffer], size_type] nans_to_nulls( const column_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] column_nans_to_nulls( const column_view& input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler @@ -55,33 +55,33 @@ cdef extern from "cudf/transform.hpp" namespace "cudf" nogil: optional[void *] user_data, null_aware is_null_aware, output_nullability null_policy, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef pair[unique_ptr[table], unique_ptr[column]] encode( table_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef pair[unique_ptr[column], table_view] one_hot_encode( column_view input_column, column_view categories, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef 
unique_ptr[column] compute_column( const table_view table, const expression& expr, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef unique_ptr[column] compute_column_jit( const table_view table, const expression& expr, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/transpose.pxd b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd index 2345ab5a2d9..0ce2048ba0f 100644 --- a/python/pylibcudf/pylibcudf/libcudf/transpose.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd @@ -6,7 +6,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -16,6 +16,6 @@ cdef extern from "cudf/transpose.hpp" namespace "cudf" nogil: table_view ] transpose( table_view input_table, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd index d3fd2f2f976..6f59ff8d5e0 100644 --- a/python/pylibcudf/pylibcudf/libcudf/unary.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd @@ -7,7 +7,7 @@ from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.column.column_view cimport column_view from pylibcudf.libcudf.types cimport data_type -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t from rmm.librmm.memory_resource cimport device_async_resource_ref @@ -42,32 +42,32 @@ cdef extern from "cudf/unary.hpp" 
namespace "cudf" nogil: cdef extern unique_ptr[column] unary_operation( column_view input, unary_operator op, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef extern unique_ptr[column] is_null( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef extern unique_ptr[column] is_valid( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef extern unique_ptr[column] cast( column_view input, data_type out_type, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr) except +libcudf_exception_handler cdef extern bool is_supported_cast(data_type from_, data_type to) noexcept cdef extern unique_ptr[column] is_nan( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler cdef extern unique_ptr[column] is_not_nan( column_view input, - cuda_stream_view stream, + cudaStream_t stream, device_async_resource_ref mr ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/unique_count.pxd b/python/pylibcudf/pylibcudf/libcudf/unique_count.pxd index 5954dace85e..04001f5a064 100644 --- a/python/pylibcudf/pylibcudf/libcudf/unique_count.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/unique_count.pxd @@ -9,7 +9,7 @@ from pylibcudf.libcudf.types cimport ( null_policy, size_type, ) -from rmm.librmm.cuda_stream_view cimport cuda_stream_view +from cuda.bindings.cyruntime cimport cudaStream_t cdef extern from "cudf/reduction/unique_count.hpp" namespace "cudf" nogil: @@ -17,9 +17,9 @@ cdef extern from "cudf/reduction/unique_count.hpp" namespace "cudf" nogil: column_view column, null_policy null_handling, nan_policy nan_handling, - cuda_stream_view stream) except +libcudf_exception_handler + cudaStream_t stream) except 
+libcudf_exception_handler cdef size_type unique_count( table_view source_table, null_equality nulls_equal, - cuda_stream_view stream) except +libcudf_exception_handler + cudaStream_t stream) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd index a9569f11706..661db24f5aa 100644 --- a/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/utilities/default_stream.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 +from cuda.bindings.cyruntime cimport cudaStream_t from libcpp cimport bool -from rmm.librmm.cuda_stream_view cimport cuda_stream_view - cdef extern from "cudf/utilities/default_stream.hpp" namespace "cudf" nogil: cdef bool is_ptds_enabled() - cdef cuda_stream_view get_default_stream() + cdef cudaStream_t get_default_stream() diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index be47db18a59..88b09c01531 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -9,7 +9,6 @@ from pylibcudf.libcudf.copying cimport out_of_bounds_policy from pylibcudf.libcudf.lists.combine cimport concatenate_null_policy from pylibcudf.libcudf.lists.contains cimport duplicate_find_option from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .scalar cimport Scalar @@ -26,33 +25,33 @@ ctypedef fused ColumnOrSizeType: cpdef Table explode_outer( Table, size_type explode_column_idx, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column concatenate_rows( Table, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column concatenate_list_elements( Column, concatenate_null_policy null_policy, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column contains( Column, ColumnOrScalar, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column contains_nulls( Column, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -60,13 +59,13 @@ cpdef Column index_of( Column, ColumnOrScalar, duplicate_find_option, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column reverse( Column, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -74,20 +73,20 @@ cpdef Column segmented_gather( Column, Column, out_of_bounds_policy bounds_policy=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column extract_list_element( Column, ColumnOrSizeType, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column count_elements( Column, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -95,7 +94,7 @@ cpdef Column sequences( Column, Column, Column steps = *, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -104,7 +103,7 @@ cpdef Column sort_lists( order, null_order, bool stable = *, 
- Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -113,7 +112,7 @@ cpdef Column difference_distinct( Column, null_equality nulls_equal=*, nan_equality nans_equal=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -122,7 +121,7 @@ cpdef Column have_overlap( Column, null_equality nulls_equal=*, nan_equality nans_equal=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -131,7 +130,7 @@ cpdef Column intersect_distinct( Column, null_equality nulls_equal=*, nan_equality nans_equal=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -140,14 +139,14 @@ cpdef Column union_distinct( Column, null_equality nulls_equal=*, nan_equality nans_equal=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column apply_boolean_mask( Column, Column, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -155,6 +154,6 @@ cpdef Column distinct( Column, null_equality, nan_equality, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi index a3bcf9f76d6..1e418b59726 100644 --- a/python/pylibcudf/pylibcudf/lists.pyi +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -1,16 +1,16 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.copying import OutOfBoundsPolicy from pylibcudf.scalar import Scalar from pylibcudf.table import Table from pylibcudf.types import NanEquality, NullEquality, NullOrder, Order +from pylibcudf.utils import CudaStreamLike class ConcatenateNullPolicy(IntEnum): IGNORE = ... 
@@ -23,66 +23,66 @@ class DuplicateFindOption(IntEnum): def explode_outer( input: Table, explode_column_idx: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def concatenate_rows( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def concatenate_list_elements( input: Column, null_policy: ConcatenateNullPolicy, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def contains( input: Column, search_key: Column | Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def contains_nulls( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def index_of( input: Column, search_key: Column | Scalar, find_option: DuplicateFindOption, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def reverse( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def segmented_gather( input: Column, gather_map_list: Column, bounds_policy: OutOfBoundsPolicy = OutOfBoundsPolicy.DONT_CHECK, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def extract_list_element( input: Column, index: Column | int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def count_elements( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def sequences( starts: Column, sizes: Column, steps: Column | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def sort_lists( @@ -90,7 +90,7 @@ def sort_lists( sort_order: Order, na_position: NullOrder, stable: bool = False, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def difference_distinct( @@ -98,7 +98,7 @@ def difference_distinct( rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def have_overlap( @@ -106,7 +106,7 @@ def have_overlap( rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def intersect_distinct( @@ -114,7 +114,7 @@ def intersect_distinct( rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def union_distinct( @@ -122,19 +122,19 @@ def union_distinct( rhs: Column, nulls_equal: NullEquality = NullEquality.EQUAL, nans_equal: NanEquality = NanEquality.ALL_EQUAL, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def apply_boolean_mask( input: Column, boolean_mask: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def distinct( input: Column, nulls_equal: NullEquality, nans_equal: NanEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 0076f7da677..fd05242e44f 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -55,6 +55,7 @@ from .column cimport Column, ListsColumnView from .scalar cimport Scalar from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "ConcatenateNullPolicy", @@ -82,7 +83,7 @@ __all__ = [ cpdef Table explode_outer( Table input, size_type explode_column_idx, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Explode a column of lists into rows. @@ -105,20 +106,21 @@ cpdef Table explode_outer( """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_explode.explode_outer( - input.view(), explode_column_idx, stream.view(), mr.get_mr() + input.view(), explode_column_idx, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Column concatenate_rows( Table input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Concatenate multiple lists columns into a single lists column row-wise. 
@@ -139,21 +141,22 @@ cpdef Column concatenate_rows( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_concatenate_rows( - input.view(), concatenate_null_policy.IGNORE, stream.view(), mr.get_mr() + input.view(), concatenate_null_policy.IGNORE, _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column concatenate_list_elements( Column input, concatenate_null_policy null_policy, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Concatenate multiple lists on the same row into a single list. @@ -174,21 +177,22 @@ cpdef Column concatenate_list_elements( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_concatenate_list_elements( - input.view(), null_policy, stream.view(), mr.get_mr() + input.view(), null_policy, _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column contains( Column input, ColumnOrScalar search_key, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a column of bool values indicating whether @@ -218,7 +222,8 @@ cpdef Column contains( cdef unique_ptr[column] c_result cdef ListsColumnView list_view = input.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if not isinstance(search_key, (Column, Scalar)): @@ -230,15 +235,15 @@ cpdef Column contains( search_key.view() if ColumnOrScalar is Column else dereference( search_key.get() ), - stream.view(), + _cs, mr.get_mr(), ) - 
return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column contains_nulls( Column input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a column of bool values indicating whether @@ -262,21 +267,22 @@ cpdef Column contains_nulls( cdef unique_ptr[column] c_result cdef ListsColumnView list_view = input.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_contains.contains_nulls( - list_view.view(), stream.view(), mr.get_mr() + list_view.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column index_of( Column input, ColumnOrScalar search_key, duplicate_find_option find_option, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a column of index values indicating the position of a search @@ -307,7 +313,8 @@ cpdef Column index_of( cdef unique_ptr[column] c_result cdef ListsColumnView list_view = input.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -317,15 +324,15 @@ cpdef Column index_of( search_key.get() ), find_option, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column reverse( Column input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Reverse the element order within each list of the input column. 
@@ -347,19 +354,20 @@ cpdef Column reverse( cdef unique_ptr[column] c_result cdef ListsColumnView list_view = input.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_reverse.reverse(list_view.view(), stream.view(), mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + c_result = cpp_reverse.reverse(list_view.view(), _cs, mr.get_mr()) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column segmented_gather( Column input, Column gather_map_list, out_of_bounds_policy bounds_policy=out_of_bounds_policy.DONT_CHECK, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a column with elements gathered based on the indices in gather_map_list @@ -394,7 +402,8 @@ cpdef Column segmented_gather( cdef ListsColumnView list_view1 = input.list_view() cdef ListsColumnView list_view2 = gather_map_list.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -402,16 +411,16 @@ cpdef Column segmented_gather( list_view1.view(), list_view2.view(), bounds_policy, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column extract_list_element( Column input, ColumnOrSizeType index, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a column of extracted list elements. 
@@ -433,22 +442,23 @@ cpdef Column extract_list_element( cdef unique_ptr[column] c_result cdef ListsColumnView list_view = input.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_extract_list_element( list_view.view(), index.view() if ColumnOrSizeType is Column else index, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column count_elements( Column input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Count the number of rows in each @@ -472,20 +482,21 @@ cpdef Column count_elements( cdef ListsColumnView list_view = input.list_view() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_count_elements(list_view.view(), stream.view(), mr.get_mr()) + c_result = cpp_count_elements(list_view.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column sequences( Column starts, Column sizes, Column steps = None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a lists column in which each row contains a sequence of @@ -509,7 +520,8 @@ cpdef Column sequences( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if steps is not None: @@ -518,22 +530,22 @@ cpdef Column sequences( starts.view(), steps.view(), sizes.view(), - stream.view(), + _cs, mr.get_mr(), ) else: with nogil: c_result = cpp_filling.sequences( - starts.view(), sizes.view(), stream.view(), 
mr.get_mr() + starts.view(), sizes.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column sort_lists( Column input, order sort_order, null_order na_position, bool stable = False, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Sort the elements within a list in each row of a list column. @@ -561,7 +573,8 @@ cpdef Column sort_lists( cdef unique_ptr[column] c_result cdef ListsColumnView list_view = input.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -570,7 +583,7 @@ cpdef Column sort_lists( list_view.view(), sort_order, na_position, - stream.view(), + _cs, mr.get_mr(), ) else: @@ -578,10 +591,10 @@ cpdef Column sort_lists( list_view.view(), sort_order, na_position, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column difference_distinct( @@ -589,7 +602,7 @@ cpdef Column difference_distinct( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a column of index values indicating the position of a search @@ -617,7 +630,8 @@ cpdef Column difference_distinct( cdef ListsColumnView lhs_view = lhs.list_view() cdef ListsColumnView rhs_view = rhs.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -626,10 +640,10 @@ cpdef Column difference_distinct( rhs_view.view(), nulls_equal, nans_equal, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, 
mr) cpdef Column have_overlap( @@ -637,7 +651,7 @@ cpdef Column have_overlap( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Check if lists at each row of the given lists columns overlap. @@ -664,7 +678,8 @@ cpdef Column have_overlap( cdef ListsColumnView lhs_view = lhs.list_view() cdef ListsColumnView rhs_view = rhs.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -673,10 +688,10 @@ cpdef Column have_overlap( rhs_view.view(), nulls_equal, nans_equal, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column intersect_distinct( @@ -684,7 +699,7 @@ cpdef Column intersect_distinct( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a lists column of distinct elements common to two input lists columns. 
@@ -711,7 +726,8 @@ cpdef Column intersect_distinct( cdef ListsColumnView lhs_view = lhs.list_view() cdef ListsColumnView rhs_view = rhs.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -720,10 +736,10 @@ cpdef Column intersect_distinct( rhs_view.view(), nulls_equal, nans_equal, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column union_distinct( @@ -731,7 +747,7 @@ cpdef Column union_distinct( Column rhs, null_equality nulls_equal=null_equality.EQUAL, nan_equality nans_equal=nan_equality.ALL_EQUAL, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a lists column of distinct elements found in @@ -759,7 +775,8 @@ cpdef Column union_distinct( cdef ListsColumnView lhs_view = lhs.list_view() cdef ListsColumnView rhs_view = rhs.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -768,16 +785,16 @@ cpdef Column union_distinct( rhs_view.view(), nulls_equal, nans_equal, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column apply_boolean_mask( Column input, Column boolean_mask, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Filters elements in each row of the input lists column using a boolean mask @@ -802,24 +819,25 @@ cpdef Column apply_boolean_mask( cdef ListsColumnView list_view = input.list_view() cdef ListsColumnView mask_view = boolean_mask.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: 
c_result = cpp_apply_boolean_mask( list_view.view(), mask_view.view(), - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column distinct( Column input, null_equality nulls_equal, nan_equality nans_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a new list column without duplicate elements in each list. @@ -843,7 +861,8 @@ cpdef Column distinct( cdef unique_ptr[column] c_result cdef ListsColumnView list_view = input.list_view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -852,10 +871,10 @@ cpdef Column distinct( nulls_equal, nans_equal, duplicate_keep_option.KEEP_ANY, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) ConcatenateNullPolicy.__str__ = ConcatenateNullPolicy.__repr__ DuplicateFindOption.__str__ = DuplicateFindOption.__repr__ diff --git a/python/pylibcudf/pylibcudf/merge.pxd b/python/pylibcudf/pylibcudf/merge.pxd index aed9dda7479..07624852289 100644 --- a/python/pylibcudf/pylibcudf/merge.pxd +++ b/python/pylibcudf/pylibcudf/merge.pxd @@ -1,9 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from .table cimport Table -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource @@ -12,6 +11,6 @@ cpdef Table merge ( list key_cols, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/merge.pyi b/python/pylibcudf/pylibcudf/merge.pyi index f96e1d8534e..50e87d5bffa 100644 --- a/python/pylibcudf/pylibcudf/merge.pyi +++ b/python/pylibcudf/pylibcudf/merge.pyi @@ -1,17 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.table import Table from pylibcudf.types import NullOrder, Order +from pylibcudf.utils import CudaStreamLike def merge( tables_to_merge: list[Table], key_cols: list[int], column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index a6cbaf81051..3c0cd93a342 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -14,6 +14,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["merge"] @@ -22,7 +23,7 @@ cpdef Table merge ( list key_cols, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Merge a set of sorted tables. @@ -58,7 +59,8 @@ cpdef Table merge ( c_tables_to_merge.push_back(( tbl).view()) cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -67,7 +69,7 @@ cpdef Table merge ( c_key_cols, c_column_order, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd index 6eb10eddb2e..e7fa70e23ae 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pxd +++ b/python/pylibcudf/pylibcudf/null_mask.pxd @@ -5,18 +5,19 @@ from pylibcudf.libcudf.types cimport mask_state, size_type from rmm.pylibrmm.device_buffer cimport DeviceBuffer from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column -cpdef DeviceBuffer copy_bitmask(Column col, Stream stream=*, DeviceMemoryResource mr=*) +cpdef DeviceBuffer copy_bitmask( + Column col, object stream = *, DeviceMemoryResource mr=* +) cpdef DeviceBuffer copy_bitmask_from_bitmask( object bitmask, size_type begin_bit, size_type end_bit, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) @@ -25,24 +26,24 @@ cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits) cpdef DeviceBuffer create_null_mask( 
size_type size, mask_state state=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) -cpdef tuple bitmask_and(list columns, Stream stream=*, DeviceMemoryResource mr=*) +cpdef tuple bitmask_and(list columns, object stream = *, DeviceMemoryResource mr=*) -cpdef tuple bitmask_or(list columns, Stream stream=*, DeviceMemoryResource mr=*) +cpdef tuple bitmask_or(list columns, object stream = *, DeviceMemoryResource mr=*) cpdef size_type null_count( object bitmask, size_type start, size_type stop, - Stream stream=* + object stream = * ) cpdef size_type index_of_first_set_bit( object bitmask, size_type start, size_type stop, - Stream stream=* + object stream = * ) diff --git a/python/pylibcudf/pylibcudf/null_mask.pyi b/python/pylibcudf/pylibcudf/null_mask.pyi index 98f6e60fb0d..45e130b704e 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyi +++ b/python/pylibcudf/pylibcudf/null_mask.pyi @@ -3,44 +3,44 @@ from rmm.pylibrmm.device_buffer import DeviceBuffer from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.span import Span from pylibcudf.types import MaskState +from pylibcudf.utils import CudaStreamLike def copy_bitmask( col: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> DeviceBuffer: ... def copy_bitmask_from_bitmask( bitmask: Span, begin_bit: int, end_bit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> DeviceBuffer: ... def bitmask_allocation_size_bytes(number_of_bits: int) -> int: ... def create_null_mask( size: int, state: MaskState = MaskState.UNINITIALIZED, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> DeviceBuffer: ... 
def bitmask_and( columns: list[Column], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[DeviceBuffer, int]: ... def bitmask_or( columns: list[Column], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[DeviceBuffer, int]: ... def null_count( - bitmask: Span, start: int, stop: int, stream: Stream | None = None + bitmask: Span, start: int, stop: int, stream: CudaStreamLike | None = None ) -> int: ... def index_of_first_set_bit( - bitmask: Span, start: int, stop: int, stream: Stream | None = None + bitmask: Span, start: int, stop: int, stream: CudaStreamLike | None = None ) -> int: ... diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx index 176e73047e2..164c51aca9f 100644 --- a/python/pylibcudf/pylibcudf/null_mask.pyx +++ b/python/pylibcudf/pylibcudf/null_mask.pyx @@ -19,6 +19,7 @@ from .span import is_span as py_is_span from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "bitmask_allocation_size_bytes", @@ -31,7 +32,7 @@ __all__ = [ ] cdef DeviceBuffer buffer_to_python( - device_buffer buf, Stream stream, DeviceMemoryResource mr + device_buffer buf, object stream, DeviceMemoryResource mr ): return DeviceBuffer.c_from_unique_ptr( make_unique[device_buffer](move(buf)), stream, mr @@ -40,7 +41,7 @@ cdef DeviceBuffer buffer_to_python( cpdef DeviceBuffer copy_bitmask( Column col, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Copies ``col``'s bitmask into a ``DeviceBuffer``. 
@@ -63,20 +64,21 @@ cpdef DeviceBuffer copy_bitmask( ``DeviceBuffer`` if ``col`` is not nullable """ cdef device_buffer db - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - db = cpp_null_mask.copy_bitmask(col.view(), stream.view(), mr.get_mr()) + db = cpp_null_mask.copy_bitmask(col.view(), _cs, mr.get_mr()) - return buffer_to_python(move(db), stream, mr) + return buffer_to_python(move(db), _stream, mr) cpdef DeviceBuffer copy_bitmask_from_bitmask( object bitmask, size_type begin_bit, size_type end_bit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Copies a portion of a bitmask into a ``DeviceBuffer``. @@ -108,7 +110,8 @@ cpdef DeviceBuffer copy_bitmask_from_bitmask( f"got {type(bitmask).__name__}" ) cdef device_buffer db - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) cdef uintptr_t ptr = bitmask.ptr @@ -117,11 +120,11 @@ cpdef DeviceBuffer copy_bitmask_from_bitmask( ptr, begin_bit, end_bit, - stream.view(), + _cs, mr.get_mr() ) - return buffer_to_python(move(db), stream, mr) + return buffer_to_python(move(db), _stream, mr) cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits): @@ -148,7 +151,7 @@ cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits): cpdef DeviceBuffer create_null_mask( size_type size, mask_state state = mask_state.UNINITIALIZED, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Creates a ``DeviceBuffer`` for use as a null value indicator bitmask of a @@ -176,16 +179,17 @@ cpdef DeviceBuffer create_null_mask( state """ cdef device_buffer db - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - db = 
cpp_null_mask.create_null_mask(size, state, stream.view(), mr.get_mr()) + db = cpp_null_mask.create_null_mask(size, state, _cs, mr.get_mr()) - return buffer_to_python(move(db), stream, mr) + return buffer_to_python(move(db), _stream, mr) -cpdef tuple bitmask_and(list columns, Stream stream=None, DeviceMemoryResource mr=None): +cpdef tuple bitmask_and(list columns, object stream=None, DeviceMemoryResource mr=None): """Performs bitwise AND of the bitmasks of a list of columns. For details, see :cpp:func:`bitmask_and`. @@ -206,16 +210,19 @@ cpdef tuple bitmask_and(list columns, Stream stream=None, DeviceMemoryResource m """ cdef Table c_table = Table(columns) cdef pair[device_buffer, size_type] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_null_mask.bitmask_and(c_table.view(), stream.view(), mr.get_mr()) + c_result = cpp_null_mask.bitmask_and( + c_table.view(), _cs, mr.get_mr() + ) - return buffer_to_python(move(c_result.first), stream, mr), c_result.second + return buffer_to_python(move(c_result.first), _stream, mr), c_result.second -cpdef tuple bitmask_or(list columns, Stream stream=None, DeviceMemoryResource mr=None): +cpdef tuple bitmask_or(list columns, object stream=None, DeviceMemoryResource mr=None): """Performs bitwise OR of the bitmasks of a list of columns. For details, see :cpp:func:`bitmask_or`. 
@@ -236,20 +243,21 @@ cpdef tuple bitmask_or(list columns, Stream stream=None, DeviceMemoryResource mr """ cdef Table c_table = Table(columns) cdef pair[device_buffer, size_type] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_null_mask.bitmask_or(c_table.view(), stream.view(), mr.get_mr()) + c_result = cpp_null_mask.bitmask_or(c_table.view(), _cs, mr.get_mr()) - return buffer_to_python(move(c_result.first), stream, mr), c_result.second + return buffer_to_python(move(c_result.first), _stream, mr), c_result.second cpdef size_type null_count( object bitmask, size_type start, size_type stop, - Stream stream=None + object stream=None ): """Given a validity bitmask, counts the number of null elements. @@ -277,20 +285,21 @@ cpdef size_type null_count( f"got {type(bitmask).__name__}" ) cdef uintptr_t ptr = bitmask.ptr - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() with nogil: return cpp_null_mask.null_count( ptr, start, stop, - stream.view() + _cs ) cpdef size_type index_of_first_set_bit( object bitmask, size_type start, size_type stop, - Stream stream=None + object stream=None ): """Given a validity bitmask, returns the index of the first valid element relative to ``start``. 
@@ -319,11 +328,12 @@ cpdef size_type index_of_first_set_bit( f"got {type(bitmask).__name__}" ) cdef uintptr_t ptr = bitmask.ptr - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() with nogil: return cpp_null_mask.index_of_first_set_bit( ptr, start, stop, - stream.view() + _cs ) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd index 8cd73fe41ad..2bc3f75b174 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -6,7 +6,6 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.nvtext.byte_pair_encode cimport bpe_merge_pairs from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cdef class BPEMergePairs: @@ -16,6 +15,6 @@ cpdef Column byte_pair_encoding( Column input, BPEMergePairs merge_pairs, Scalar separator=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi index 4abf1f52b4d..7ee48f72209 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyi @@ -1,17 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike class BPEMergePairs: def __init__( self, merge_pairs: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ): ... @@ -19,6 +19,6 @@ def byte_pair_encoding( input: Column, merge_pairs: BPEMergePairs, separator: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx index 001b9dfca1e..023e00a1169 100644 --- a/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/byte_pair_encode.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -19,6 +19,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["BPEMergePairs", "byte_pair_encoding"] @@ -30,14 +31,17 @@ cdef class BPEMergePairs: def __cinit__( self, Column merge_pairs, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): cdef column_view c_pairs = merge_pairs.view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - self.c_obj = move(cpp_load_merge_pairs(c_pairs, stream.view(), mr.get_mr())) + self.c_obj = move( + cpp_load_merge_pairs(c_pairs, _cs, mr.get_mr()) + ) __hash__ = None @@ -45,7 +49,7 @@ cpdef Column byte_pair_encoding( Column input, BPEMergePairs merge_pairs, Scalar separator=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -70,12 +74,13 @@ cpdef Column byte_pair_encoding( An encoded column of strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if separator is None: separator = Scalar.from_libcudf( - cpp_make_string_scalar(" ".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar(" ".encode(), _stream.view().value(), mr.get_mr()) ) with nogil: @@ -84,9 +89,9 @@ cpdef Column byte_pair_encoding( input.view(), dereference(merge_pairs.c_obj.get()), dereference(separator.c_obj.get()), - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/deduplicate.pxd b/python/pylibcudf/pylibcudf/nvtext/deduplicate.pxd index ecca0a495a1..d038d4a3e27 100644 --- a/python/pylibcudf/pylibcudf/nvtext/deduplicate.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/deduplicate.pxd @@ -1,22 +1,21 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column build_suffix_array( Column input, size_type min_width, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Column resolve_duplicates( Column input, Column indices, size_type min_width, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Column resolve_duplicates_pair( @@ -25,6 +24,6 @@ cpdef Column resolve_duplicates_pair( Column input2, Column indices2, size_type min_width, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyi b/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyi index 6e3d6883df4..653ee588f61 100644 --- a/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyi @@ -1,22 +1,22 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def build_suffix_array( input: Column, min_width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def resolve_duplicates( input: Column, indices: Column, min_width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def resolve_duplicates_pair( @@ -25,6 +25,6 @@ def resolve_duplicates_pair( input2: Column, indices2: Column, min_width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyx b/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyx index c71ae479674..e679841a792 100644 --- a/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/deduplicate.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator import dereference @@ -18,6 +18,7 @@ from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.librmm.device_buffer cimport device_buffer from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "build_suffix_array", @@ -36,14 +37,12 @@ cdef Column _column_from_suffix_array( device_buffer(), 0 ) - ), - stream, - mr + ), stream, mr ) cpdef Column build_suffix_array( - Column input, size_type min_width, Stream stream=None, DeviceMemoryResource mr=None + Column input, size_type min_width, object stream=None, DeviceMemoryResource mr=None ): """ Builds a suffix array for the input strings column. 
@@ -68,22 +67,23 @@ cpdef Column build_suffix_array( New column of suffix array """ cdef cpp_suffix_array_type c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_build_suffix_array( - input.view(), min_width, stream.view(), mr.get_mr() + input.view(), min_width, _cs, mr.get_mr() ) - return _column_from_suffix_array(move(c_result), stream, mr) + return _column_from_suffix_array(move(c_result), _stream, mr) cpdef Column resolve_duplicates( Column input, Column indices, size_type min_width, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -111,15 +111,16 @@ cpdef Column resolve_duplicates( New column of duplicate strings """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_resolve_duplicates( - input.view(), indices.view(), min_width, stream.view(), mr.get_mr() + input.view(), indices.view(), min_width, _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column resolve_duplicates_pair( @@ -128,7 +129,7 @@ cpdef Column resolve_duplicates_pair( Column input2, Column indices2, size_type min_width, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -161,7 +162,8 @@ cpdef Column resolve_duplicates_pair( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -171,8 +173,8 @@ cpdef Column resolve_duplicates_pair( input2.view(), indices2.view(), min_width, - stream.view(), + _cs, mr.get_mr(), ) - return Column.from_libcudf(move(c_result), stream, mr) + return 
Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd index aca87ac4882..c0297ebd887 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pxd @@ -1,20 +1,19 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column edit_distance( Column input, Column targets, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column edit_distance_matrix( Column input, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi index 8c0e97b9951..5a6bde4cb66 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyi @@ -1,19 +1,19 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def edit_distance( input: Column, targets: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def edit_distance_matrix( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx index 14d3b4539dc..4b9d3f6bcc3 100644 --- a/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/edit_distance.pyx @@ -17,13 +17,14 @@ from rmm.pylibrmm.stream cimport Stream from ..column cimport Column from ..utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["edit_distance", "edit_distance_matrix"] cpdef Column edit_distance( Column input, Column targets, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -48,18 +49,19 @@ cpdef Column edit_distance( cdef column_view c_strings = input.view() cdef column_view c_targets = targets.view() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_edit_distance(c_strings, c_targets, stream.view(), mr.get_mr()) + c_result = cpp_edit_distance(c_strings, c_targets, _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column edit_distance_matrix( Column input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -88,10 +90,11 @@ cpdef Column edit_distance_matrix( ) cdef column_view c_strings = input.view() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_edit_distance_matrix(c_strings, stream.view(), mr.get_mr()) + c_result = cpp_edit_distance_matrix(c_strings, _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git 
a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd index 1eb55f1fcf6..85477223954 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint32_t @@ -6,21 +6,20 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column generate_ngrams( Column input, size_type ngrams, Scalar separator, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column generate_character_ngrams( Column input, size_type ngrams=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -28,6 +27,6 @@ cpdef Column hash_character_ngrams( Column input, size_type ngrams, uint32_t seed, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi index 7a522acc5a9..317fdb9ee73 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyi @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from typing import Any @@ -6,28 +6,28 @@ from typing import Any import numpy as np from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def generate_ngrams( input: Column, ngrams: int, separator: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def generate_character_ngrams( input: Column, ngrams: int = 2, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def hash_character_ngrams( input: Column, ngrams: int, seed: int | np.unsignedinteger[Any], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx index ca8a21c279c..6d70751a5a0 100644 --- a/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/generate_ngrams.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint32_t @@ -18,6 +18,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "generate_ngrams", @@ -29,7 +30,7 @@ cpdef Column generate_ngrams( Column input, size_type ngrams, Scalar separator, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -56,7 +57,8 @@ cpdef Column generate_ngrams( cdef column_view c_strings = input.view() cdef const string_scalar* c_separator = separator.c_obj.get() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -64,16 +66,16 @@ cpdef Column generate_ngrams( c_strings, ngrams, c_separator[0], - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column generate_character_ngrams( Column input, size_type ngrams = 2, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -97,24 +99,25 @@ cpdef Column generate_character_ngrams( """ cdef column_view c_strings = input.view() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_generate_character_ngrams( c_strings, ngrams, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column hash_character_ngrams( Column input, size_type ngrams, uint32_t seed, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -140,7 +143,8 
@@ cpdef Column hash_character_ngrams( """ cdef column_view c_strings = input.view() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -148,7 +152,7 @@ cpdef Column hash_character_ngrams( c_strings, ngrams, seed, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd index fbf8e99ac55..1e3a26454a1 100644 --- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pxd @@ -1,16 +1,15 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column jaccard_index( Column input1, Column input2, size_type width, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi index abc86597c0e..355d2d7a92f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyi @@ -1,15 +1,15 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def jaccard_index( input1: Column, input2: Column, width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx index 4089853ca77..24a343e4508 100644 --- a/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/jaccard.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -13,6 +13,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["jaccard_index"] @@ -20,7 +21,7 @@ cpdef Column jaccard_index( Column input1, Column input2, size_type width, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -49,7 +50,8 @@ cpdef Column jaccard_index( cdef column_view c_input1 = input1.view() cdef column_view c_input2 = input2.view() cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -57,8 +59,8 @@ cpdef Column jaccard_index( c_input1, c_input2, width, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd 
b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd index 0647337324d..f26b1e30245 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint32_t, uint64_t @@ -6,7 +6,6 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrScalar: Column @@ -18,7 +17,7 @@ cpdef Column minhash( Column a, Column b, size_type width, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) @@ -28,7 +27,7 @@ cpdef Column minhash64( Column a, Column b, size_type width, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) @@ -38,7 +37,7 @@ cpdef Column minhash_ngrams( uint32_t seed, Column a, Column b, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) @@ -48,6 +47,6 @@ cpdef Column minhash64_ngrams( uint64_t seed, Column a, Column b, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi index ee924f8d7aa..5bce73dc991 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyi @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from typing import Any @@ -6,9 +6,9 @@ from typing import Any import numpy as np from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def minhash( input: Column, @@ -16,7 +16,7 @@ def minhash( a: Column, b: Column, width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def minhash64( @@ -25,7 +25,7 @@ def minhash64( a: Column, b: Column, width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def minhash_ngrams( @@ -34,7 +34,7 @@ def minhash_ngrams( seed: int | np.unsignedinteger[Any], a: Column, b: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def minhash64_ngrams( @@ -43,6 +43,6 @@ def minhash64_ngrams( seed: int | np.unsignedinteger[Any], a: Column, b: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx index 1329d88060c..3029ed54c50 100644 --- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libc.stdint cimport uint32_t, uint64_t @@ -16,6 +16,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "minhash", @@ -30,7 +31,7 @@ cpdef Column minhash( Column a, Column b, size_type width, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """ @@ -58,7 +59,8 @@ cpdef Column minhash( List column of minhash values for each string per seed """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -68,11 +70,11 @@ cpdef Column minhash( a.view(), b.view(), width, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column minhash64( Column input, @@ -80,7 +82,7 @@ cpdef Column minhash64( Column a, Column b, size_type width, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """ @@ -110,7 +112,8 @@ cpdef Column minhash64( List column of minhash values for each string per seed """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -120,11 +123,11 @@ cpdef Column minhash64( a.view(), b.view(), width, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column minhash_ngrams( Column input, @@ -132,7 +135,7 @@ cpdef Column minhash_ngrams( uint32_t seed, Column a, Column b, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """ @@ 
-163,7 +166,8 @@ cpdef Column minhash_ngrams( value in columns a and b. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -173,11 +177,11 @@ cpdef Column minhash_ngrams( seed, a.view(), b.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column minhash64_ngrams( Column input, @@ -185,7 +189,7 @@ cpdef Column minhash64_ngrams( uint64_t seed, Column a, Column b, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """ @@ -216,7 +220,8 @@ cpdef Column minhash64_ngrams( value in columns a and b. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -226,8 +231,8 @@ cpdef Column minhash64_ngrams( seed, a.view(), b.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd index f410d778cb1..5deaa45c73f 100644 --- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column ngrams_tokenize( @@ -13,6 +12,6 @@ cpdef Column ngrams_tokenize( size_type ngrams, Scalar delimiter, Scalar separator, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi index 1347b7e7087..99c309a21ff 100644 --- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyi @@ -1,17 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def ngrams_tokenize( input: Column, ngrams: int, delimiter: Scalar, separator: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx index f9f36244a1d..959c47d595d 100644 --- a/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/ngrams_tokenize.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -15,6 +15,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["ngrams_tokenize"] @@ -23,7 +24,7 @@ cpdef Column ngrams_tokenize( size_type ngrams, Scalar delimiter, Scalar separator, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -52,7 +53,8 @@ cpdef Column ngrams_tokenize( New strings columns of tokens """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -61,7 +63,7 @@ cpdef Column ngrams_tokenize( ngrams, dereference(delimiter.get()), dereference(separator.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd index 8c8623e07a3..30e459f75a5 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -6,16 +6,17 @@ from libcpp.memory cimport unique_ptr from pylibcudf.column cimport Column from pylibcudf.libcudf.nvtext.normalize cimport character_normalizer from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cdef class CharacterNormalizer: cdef unique_ptr[character_normalizer] c_obj -cpdef Column normalize_spaces(Column input, Stream stream=*, DeviceMemoryResource mr=*) +cpdef Column normalize_spaces( + Column input, object stream = *, DeviceMemoryResource mr=* +) cpdef Column normalize_characters( Column input, CharacterNormalizer normalizer, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi index 958adb10ada..0fbd2e7e725 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyi @@ -1,28 +1,28 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike class CharacterNormalizer: def __init__( self, do_lower_case: bool, special_tokens: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ): ... def normalize_spaces( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def normalize_characters( input: Column, normalizer: CharacterNormalizer, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx index 5f62189f2f5..8e29aad9121 100644 --- a/python/pylibcudf/pylibcudf/nvtext/normalize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/normalize.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -12,6 +12,7 @@ from pylibcudf.libcudf.nvtext cimport normalize as cpp_normalize from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "CharacterNormalizer" @@ -28,18 +29,19 @@ cdef class CharacterNormalizer: self, bool do_lower_case, Column tokens, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): cdef column_view c_tokens = tokens.view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: self.c_obj = move( cpp_normalize.create_character_normalizer( do_lower_case, c_tokens, - stream.view(), + _cs, mr.get_mr() ) ) @@ -47,7 +49,7 @@ cdef class CharacterNormalizer: __hash__ = None cpdef Column normalize_spaces( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new strings column by normalizing the whitespace in @@ -68,21 +70,22 @@ cpdef Column normalize_spaces( New strings columns of normalized strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_normalize.normalize_spaces( - input.view(), stream.view(), mr.get_mr() + input.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column normalize_characters( Column input, CharacterNormalizer normalizer, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -105,15 +108,16 @@ cpdef Column normalize_characters( Normalized strings column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_normalize.normalize_characters( input.view(), dereference(normalizer.c_obj.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/nvtext/replace.pxd index c6a9ed5ba67..1265f75a514 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column replace_tokens( @@ -13,7 +12,7 @@ cpdef Column replace_tokens( Column targets, Column replacements, Scalar delimiter=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -22,6 +21,6 @@ cpdef Column filter_tokens( size_type min_token_length, Scalar replacement=*, Scalar delimiter=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyi b/python/pylibcudf/pylibcudf/nvtext/replace.pyi index 09187c1edf1..a5e451cdb16 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyi @@ -1,18 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def replace_tokens( input: Column, targets: Column, replacements: Column, delimiter: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def filter_tokens( @@ -20,6 +20,6 @@ def filter_tokens( min_token_length: int, replacement: Scalar | None = None, delimiter: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/replace.pyx b/python/pylibcudf/pylibcudf/nvtext/replace.pyx index db375e6993f..4b00d76bd64 100644 --- a/python/pylibcudf/pylibcudf/nvtext/replace.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/replace.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -19,6 +19,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["filter_tokens", "replace_tokens"] @@ -27,7 +28,7 @@ cpdef Column replace_tokens( Column targets, Column replacements, Scalar delimiter=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -55,11 +56,12 @@ cpdef Column replace_tokens( New strings column with replaced strings """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if delimiter is None: delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) with nogil: c_result = cpp_replace_tokens( @@ -67,10 +69,10 @@ cpdef Column replace_tokens( targets.view(), replacements.view(), dereference(delimiter.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column filter_tokens( @@ -78,7 +80,7 @@ cpdef Column filter_tokens( size_type min_token_length, Scalar replacement=None, Scalar delimiter=None, - Stream stream=None, + object stream=None, DeviceMemoryResource 
mr=None, ): """ @@ -107,15 +109,16 @@ cpdef Column filter_tokens( New strings column of filtered strings """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if delimiter is None: delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) if replacement is None: replacement = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) with nogil: @@ -124,8 +127,8 @@ cpdef Column filter_tokens( min_token_length, dereference(replacement.get()), dereference(delimiter.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd index 0b19c699ea8..d9f9ef1549c 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -6,7 +6,6 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.nvtext.stemmer cimport letter_type from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrSize: Column @@ -16,10 +15,10 @@ cpdef Column is_letter( Column input, bool check_vowels, ColumnOrSize indices, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column porter_stemmer_measure( - Column input, Stream stream=*, DeviceMemoryResource mr=* + Column input, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi index ae53ce887a4..5fef689a895 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyi @@ -1,20 +1,20 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def is_letter( input: Column, check_vowels: bool, indices: Column | int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def porter_stemmer_measure( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx index 44dc6be5c60..e038cd03fb2 100644 --- a/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/stemmer.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -18,6 +18,7 @@ from pylibcudf.utils cimport _get_stream, _get_memory_resource from pylibcudf.libcudf.nvtext.stemmer import letter_type as LetterType # no-cython-lint from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["is_letter", "porter_stemmer_measure", "LetterType"] @@ -25,7 +26,7 @@ cpdef Column is_letter( Column input, bool check_vowels, ColumnOrSize indices, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -55,7 +56,8 @@ cpdef Column is_letter( New boolean column. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -63,14 +65,14 @@ cpdef Column is_letter( input.view(), letter_type.VOWEL if check_vowels else letter_type.CONSONANT, indices if ColumnOrSize is size_type else indices.view(), - stream.view() + _cs ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column porter_stemmer_measure( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Returns the Porter Stemmer measurements of a strings column. 
@@ -92,12 +94,13 @@ cpdef Column porter_stemmer_measure( New column of measure values """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_porter_stemmer_measure(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_porter_stemmer_measure(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) LetterType.__str__ = LetterType.__repr__ diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd index 2ad694d1eca..8346d420440 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -7,36 +7,35 @@ from pylibcudf.libcudf.nvtext.tokenize cimport tokenize_vocabulary from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cdef class TokenizeVocabulary: cdef unique_ptr[tokenize_vocabulary] c_obj cpdef Column tokenize_scalar( - Column input, Scalar delimiter=*, Stream stream=*, DeviceMemoryResource mr=* + Column input, Scalar delimiter=*, object stream = *, DeviceMemoryResource mr=* ) cpdef Column tokenize_column( - Column input, Column delimiters, Stream stream=*, DeviceMemoryResource mr=* + Column input, Column delimiters, object stream = *, DeviceMemoryResource mr=* ) cpdef Column count_tokens_scalar( - Column input, Scalar delimiter=*, Stream stream=*, DeviceMemoryResource mr=* + Column input, Scalar delimiter=*, object stream = *, DeviceMemoryResource mr=* ) cpdef Column count_tokens_column( - Column input, Column delimiters, Stream stream=*, DeviceMemoryResource mr=* + Column input, Column delimiters, object stream = *, DeviceMemoryResource mr=* ) cpdef Column character_tokenize( - Column input, Stream stream=*, DeviceMemoryResource mr=* + Column input, object stream = *, DeviceMemoryResource mr=* ) cpdef Column detokenize( Column input, Column row_indices, Scalar separator=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -45,6 +44,6 @@ cpdef Column tokenize_with_vocabulary( TokenizeVocabulary vocabulary, Scalar delimiter, size_type default_id=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi index c6e2d4cfcb4..72a5209902e 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyi @@ -1,54 +1,54 @@ -# SPDX-FileCopyrightText: 
Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike class TokenizeVocabulary: def __init__( self, vocab: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ): ... def tokenize_scalar( input: Column, delimiter: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def tokenize_column( input: Column, delimiters: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def count_tokens_scalar( input: Column, delimiter: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def count_tokens_column( input: Column, delimiters: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def character_tokenize( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def detokenize( input: Column, row_indices: Column, separator: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def tokenize_with_vocabulary( @@ -56,6 +56,6 @@ def tokenize_with_vocabulary( vocabulary: TokenizeVocabulary, delimiter: Scalar, default_id: int = -1, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx index e296ea38a58..4e44d781cc4 100644 --- a/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/tokenize.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -24,6 +24,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "TokenizeVocabulary", @@ -41,19 +42,20 @@ cdef class TokenizeVocabulary: For details, see :cpp:class:`cudf::nvtext::tokenize_vocabulary`. """ - def __cinit__(self, Column vocab, Stream stream=None, DeviceMemoryResource mr=None): + def __cinit__(self, Column vocab, object stream=None, DeviceMemoryResource mr=None): cdef column_view c_vocab = vocab.view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - self.c_obj = move(cpp_load_vocabulary(c_vocab, stream.view(), mr.get_mr())) + self.c_obj = move(cpp_load_vocabulary(c_vocab, _cs, mr.get_mr())) __hash__ = None cpdef Column tokenize_scalar( Column input, Scalar delimiter=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -77,26 +79,27 @@ cpdef Column tokenize_scalar( New strings columns of tokens """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if delimiter is None: delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) 
+ cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) with nogil: c_result = cpp_tokenize( input.view(), dereference(delimiter.c_obj.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column tokenize_column( - Column input, Column delimiters, Stream stream=None, DeviceMemoryResource mr=None + Column input, Column delimiters, object stream=None, DeviceMemoryResource mr=None ): """ Returns a single column of strings by tokenizing the input @@ -119,23 +122,24 @@ cpdef Column tokenize_column( New strings columns of tokens """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_tokenize( input.view(), delimiters.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column count_tokens_scalar( Column input, Scalar delimiter=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -159,26 +163,27 @@ cpdef Column count_tokens_scalar( New column of token counts """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if delimiter is None: delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) with nogil: c_result = cpp_count_tokens( input.view(), dereference(delimiter.c_obj.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column count_tokens_column( - Column input, 
Column delimiters, Stream stream=None, DeviceMemoryResource mr=None + Column input, Column delimiters, object stream=None, DeviceMemoryResource mr=None ): """ Returns the number of tokens in each string of a strings column @@ -201,21 +206,22 @@ cpdef Column count_tokens_column( New column of token counts """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_count_tokens( input.view(), delimiters.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column character_tokenize( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Returns a single column of strings by converting @@ -236,18 +242,19 @@ cpdef Column character_tokenize( New strings columns of tokens """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_character_tokenize(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_character_tokenize(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column detokenize( Column input, Column row_indices, Scalar separator=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -273,12 +280,13 @@ cpdef Column detokenize( New strings columns of tokens """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if separator is None: separator = Scalar.from_libcudf( - 
cpp_make_string_scalar(" ".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar(" ".encode(), _stream.view().value(), mr.get_mr()) ) with nogil: @@ -286,18 +294,18 @@ cpdef Column detokenize( input.view(), row_indices.view(), dereference(separator.c_obj.get()), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column tokenize_with_vocabulary( Column input, TokenizeVocabulary vocabulary, Scalar delimiter, size_type default_id=-1, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -325,7 +333,8 @@ cpdef Column tokenize_with_vocabulary( Lists column of token ids """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -334,8 +343,8 @@ cpdef Column tokenize_with_vocabulary( dereference(vocabulary.c_obj.get()), dereference(delimiter.c_obj.get()), default_id, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd index 3f7685903e0..604a566c701 100644 --- a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -6,7 +6,6 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.nvtext.wordpiece_tokenize cimport wordpiece_vocabulary from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cdef class WordPieceVocabulary: cdef unique_ptr[wordpiece_vocabulary] c_obj @@ -15,6 +14,6 @@ cpdef Column wordpiece_tokenize( Column input, WordPieceVocabulary vocabulary, size_type max_words_per_row, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi index e91cfc8f21e..e77a8c86a69 100644 --- a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyi @@ -1,16 +1,16 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike class WordPieceVocabulary: def __init__( self, vocab: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ): ... @@ -18,6 +18,6 @@ def wordpiece_tokenize( input: Column, vocabulary: WordPieceVocabulary, max_words_per_row: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx index b6c516cf739..dfdb563087d 100644 --- a/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx +++ b/python/pylibcudf/pylibcudf/nvtext/wordpiece_tokenize.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -15,6 +15,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "WordPieceVocabulary", @@ -29,15 +30,16 @@ cdef class WordPieceVocabulary: def __cinit__( self, Column vocab, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): cdef column_view c_vocab = vocab.view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: self.c_obj = move(cpp_load_wordpiece_vocabulary( - c_vocab, stream.view(), mr.get_mr() + c_vocab, _cs, mr.get_mr() )) __hash__ = None @@ -46,7 +48,7 @@ cpdef Column wordpiece_tokenize( Column input, WordPieceVocabulary vocabulary, size_type max_words_per_row, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -73,7 +75,8 @@ cpdef Column wordpiece_tokenize( Lists column of token ids """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -81,8 +84,8 @@ cpdef Column wordpiece_tokenize( input.view(), dereference(vocabulary.c_obj.get()), max_words_per_row, - stream.view(), + _cs, mr.get_mr() ) - 
return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/partitioning.pxd index 096b4eb99e8..84c9b647691 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/partitioning.pxd @@ -1,7 +1,6 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from libc.stdint cimport uint32_t @@ -20,7 +19,7 @@ cpdef tuple[Table, list] hash_partition( int num_partitions, hash_id hash_function = *, uint32_t seed = *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -28,7 +27,7 @@ cpdef tuple[Table, list] partition( Table t, Column partition_map, int num_partitions, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -36,6 +35,6 @@ cpdef tuple[Table, list] round_robin_partition( Table input, int num_partitions, int start_partition=*, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyi b/python/pylibcudf/pylibcudf/partitioning.pyi index 081ee53731f..971346421ea 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyi +++ b/python/pylibcudf/pylibcudf/partitioning.pyi @@ -4,10 +4,10 @@ from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike class HashId(IntEnum): HASH_IDENTITY = ... @@ -19,20 +19,20 @@ def hash_partition( num_partitions: int, hash_function: HashId = ..., seed: int = ..., - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, list[int]]: ... 
def partition( t: Table, partition_map: Column, num_partitions: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, list[int]]: ... def round_robin_partition( input: Table, num_partitions: int, start_partition: int = 0, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, list[int]]: ... diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx index b8da9249656..62e35ab9cca 100644 --- a/python/pylibcudf/pylibcudf/partitioning.pyx +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -15,6 +15,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ @@ -29,7 +30,7 @@ cpdef tuple[Table, list] hash_partition( int num_partitions, cpp_partitioning.hash_id hash_function = cpp_partitioning.hash_id.HASH_MURMUR3, uint32_t seed = cpp_partitioning.DEFAULT_HASH_SEED, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -63,7 +64,8 @@ cpdef tuple[Table, list] hash_partition( cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result cdef int c_num_partitions = num_partitions cdef vector[libcudf_types.size_type] columns_to_hash - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if TableOrList is Table: with nogil: @@ -73,7 +75,7 @@ cpdef tuple[Table, list] hash_partition( c_num_partitions, hash_function, seed, - stream.view(), + _cs, mr.get_mr() ) else: @@ -85,17 +87,17 @@ cpdef tuple[Table, list] hash_partition( c_num_partitions, hash_function, seed, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result.first), stream, mr), 
list(c_result.second) + return Table.from_libcudf(move(c_result.first), _stream, mr), list(c_result.second) cpdef tuple[Table, list] partition( Table t, Column partition_map, int num_partitions, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -126,7 +128,8 @@ cpdef tuple[Table, list] partition( cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result cdef int c_num_partitions = num_partitions - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -134,18 +137,18 @@ cpdef tuple[Table, list] partition( t.view(), partition_map.view(), c_num_partitions, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result.first), stream, mr), list(c_result.second) + return Table.from_libcudf(move(c_result.first), _stream, mr), list(c_result.second) cpdef tuple[Table, list] round_robin_partition( Table input, int num_partitions, int start_partition=0, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -176,7 +179,8 @@ cpdef tuple[Table, list] round_robin_partition( cdef int c_num_partitions = num_partitions cdef int c_start_partition = start_partition - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -184,8 +188,8 @@ cpdef tuple[Table, list] round_robin_partition( input.view(), c_num_partitions, c_start_partition, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result.first), stream, mr), list(c_result.second) + return Table.from_libcudf(move(c_result.first), _stream, mr), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/quantiles.pxd index 9492ef8ce38..668e8015688 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pxd +++ b/python/pylibcudf/pylibcudf/quantiles.pxd @@ 
-1,9 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.vector cimport vector from pylibcudf.libcudf.types cimport interpolation, sorted from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -15,7 +14,7 @@ cpdef Column quantile( interpolation interp = *, Column ordered_indices = *, bint exact = *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -26,6 +25,6 @@ cpdef Table quantiles( sorted is_input_sorted = *, list column_order = *, list null_precedence = *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/quantiles.pyi b/python/pylibcudf/pylibcudf/quantiles.pyi index 2e414357651..9af646407ab 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyi +++ b/python/pylibcudf/pylibcudf/quantiles.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from collections.abc import Iterable from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table from pylibcudf.types import Interpolation, NullOrder, Order, Sorted +from pylibcudf.utils import CudaStreamLike def quantile( input: Column, @@ -16,7 +16,7 @@ def quantile( interp: Interpolation = Interpolation.LINEAR, ordered_indices: Column | None = None, exact: bool = True, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def quantiles( @@ -26,6 +26,6 @@ def quantiles( is_input_sorted: Sorted = Sorted.NO, column_order: list[Order] | None = None, null_precedence: list[NullOrder] | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index de1ee3344d3..f02643754cb 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -20,6 +20,7 @@ from .column cimport Column from .table cimport Table from .types cimport interpolation from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["quantile", "quantiles"] @@ -29,7 +30,7 @@ cpdef Column quantile( interpolation interp = interpolation.LINEAR, Column ordered_indices = None, bool exact=True, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes quantiles with interpolation. 
@@ -74,7 +75,8 @@ cpdef Column quantile( else: ordered_indices_view = ordered_indices.view() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -84,11 +86,11 @@ cpdef Column quantile( interp, ordered_indices_view, exact, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Table quantiles( @@ -98,7 +100,7 @@ cpdef Table quantiles( sorted is_input_sorted = sorted.NO, list column_order = None, list null_precedence = None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes row quantiles with interpolation. @@ -156,7 +158,8 @@ cpdef Table quantiles( if null_precedence is not None: null_precedence_vec = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -167,8 +170,8 @@ cpdef Table quantiles( is_input_sorted, column_order_vec, null_precedence_vec, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/reduce.pxd b/python/pylibcudf/pylibcudf/reduce.pxd index e9acd2aaed5..dc33d7053f4 100644 --- a/python/pylibcudf/pylibcudf/reduce.pxd +++ b/python/pylibcudf/pylibcudf/reduce.pxd @@ -4,7 +4,6 @@ from libcpp cimport bool from pylibcudf.libcudf.reduce cimport scan_type from pylibcudf.libcudf.types cimport nan_policy, null_policy, size_type -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .aggregation cimport Aggregation @@ -18,7 +17,7 @@ cpdef Scalar reduce( Aggregation agg, DataType data_type, Scalar init = *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -26,11 
+25,11 @@ cpdef Column scan( Column col, Aggregation agg, scan_type inclusive, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) -cpdef tuple minmax(Column col, Stream stream = *, DeviceMemoryResource mr = *) +cpdef tuple minmax(Column col, object stream = *, DeviceMemoryResource mr = *) cpdef bool is_valid_reduce_aggregation(DataType source, Aggregation agg) @@ -38,12 +37,12 @@ cpdef size_type unique_count( Column source, null_policy null_handling, nan_policy nan_handling, - Stream stream = * + object stream = * ) cpdef size_type distinct_count( Column source, null_policy null_handling, nan_policy nan_handling, - Stream stream = * + object stream = * ) diff --git a/python/pylibcudf/pylibcudf/reduce.pyi b/python/pylibcudf/pylibcudf/reduce.pyi index 5956b93661c..9e1c643b0cd 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyi +++ b/python/pylibcudf/pylibcudf/reduce.pyi @@ -4,12 +4,12 @@ from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.aggregation import Aggregation from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.types import DataType, NanPolicy, NullPolicy +from pylibcudf.utils import CudaStreamLike class ScanType(IntEnum): INCLUSIVE = ... @@ -19,19 +19,19 @@ def reduce( col: Column, agg: Aggregation, data_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Scalar: ... def scan( col: Column, agg: Aggregation, inclusive: ScanType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def minmax( col: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Scalar, Scalar]: ... 
def is_valid_reduce_aggregation( @@ -41,11 +41,11 @@ def unique_count( source: Column, null_handling: NullPolicy, nan_handling: NanPolicy, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> int: ... def distinct_count( source: Column, null_handling: NullPolicy, nan_handling: NanPolicy, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> int: ... diff --git a/python/pylibcudf/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx index 54036b73e85..95c3555d021 100644 --- a/python/pylibcudf/pylibcudf/reduce.pyx +++ b/python/pylibcudf/pylibcudf/reduce.pyx @@ -31,6 +31,7 @@ from .types cimport DataType from .utils cimport _get_stream, _get_memory_resource from pylibcudf.libcudf.reduce import scan_type as ScanType # no-cython-lint +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "ScanType", @@ -47,7 +48,7 @@ cpdef Scalar reduce( Aggregation agg, DataType data_type, Scalar init=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a reduction on a column @@ -79,7 +80,8 @@ cpdef Scalar reduce( cdef optional[reference_wrapper[constscalar]] c_init cdef const scalar* c_init_ptr - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if init is not None: @@ -96,7 +98,7 @@ cpdef Scalar reduce( dereference(c_agg), data_type.c_obj, c_init, - stream.view(), + _cs, mr.get_mr() ) return Scalar.from_libcudf(move(result)) @@ -106,7 +108,7 @@ cpdef Column scan( Column col, Aggregation agg, scan_type inclusive, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a scan on a column @@ -134,7 +136,8 @@ cpdef Column scan( cdef unique_ptr[column] result cdef const scan_aggregation *c_agg = agg.view_underlying_as_scan() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = 
_get_memory_resource(mr) with nogil: @@ -143,13 +146,13 @@ cpdef Column scan( dereference(c_agg), inclusive, null_policy.EXCLUDE, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) -cpdef tuple minmax(Column col, Stream stream=None, DeviceMemoryResource mr=None): +cpdef tuple minmax(Column col, object stream=None, DeviceMemoryResource mr=None): """Compute the minimum and maximum of a column For details, see ``cudf::minmax`` documentation. @@ -173,11 +176,12 @@ cpdef tuple minmax(Column col, Stream stream=None, DeviceMemoryResource mr=None) cdef Scalar min_scalar cdef Scalar max_scalar - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_minmax(col.view(), stream.view(), mr.get_mr()) + result = cpp_minmax(col.view(), _cs, mr.get_mr()) min_scalar = Scalar.from_libcudf(move(result.first)) max_scalar = Scalar.from_libcudf(move(result.second)) @@ -206,7 +210,7 @@ cpdef size_type unique_count( Column source, null_policy null_handling, nan_policy nan_handling, - Stream stream=None + object stream=None ): """Returns the number of unique consecutive elements in the input column. @@ -231,10 +235,10 @@ cpdef size_type unique_count( If the input column is sorted, then unique_count can produce the same result as distinct_count, but faster. """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) return cpp_unique_count.unique_count( - source.view(), null_handling, nan_handling, stream.view() + source.view(), null_handling, nan_handling, _stream.view().value() ) @@ -242,7 +246,7 @@ cpdef size_type distinct_count( Column source, null_policy null_handling, nan_policy nan_handling, - Stream stream=None + object stream=None ): """Returns the number of distinct elements in the input column. 
@@ -262,10 +266,10 @@ cpdef size_type distinct_count( size_type The number of distinct elements in the input column. """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) return cpp_distinct_count.distinct_count( - source.view(), null_handling, nan_handling, stream.view() + source.view(), null_handling, nan_handling, _stream.view().value() ) diff --git a/python/pylibcudf/pylibcudf/replace.pxd b/python/pylibcudf/pylibcudf/replace.pxd index 49b57753eb1..7e78e92d514 100644 --- a/python/pylibcudf/pylibcudf/replace.pxd +++ b/python/pylibcudf/pylibcudf/replace.pxd @@ -1,9 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from pylibcudf.libcudf.replace cimport replace_policy -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column @@ -22,7 +21,7 @@ ctypedef fused ReplacementType: cpdef Column replace_nulls( Column source_column, ReplacementType replacement, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -30,7 +29,7 @@ cpdef Column find_and_replace_all( Column source_column, Column values_to_replace, Column replacement_values, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -40,13 +39,13 @@ cpdef Column clamp( Scalar hi, Scalar lo_replace=*, Scalar hi_replace=*, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column normalize_nans_and_zeros( Column source_column, bool inplace=*, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/replace.pyi b/python/pylibcudf/pylibcudf/replace.pyi index d7a35721769..f74e06c3909 100644 --- a/python/pylibcudf/pylibcudf/replace.pyi +++ b/python/pylibcudf/pylibcudf/replace.pyi @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 
2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike class ReplacePolicy(IntEnum): PRECEDING = ... @@ -16,14 +16,14 @@ class ReplacePolicy(IntEnum): def replace_nulls( source_column: Column, replacement: Column | Scalar | ReplacePolicy, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def find_and_replace_all( source_column: Column, values_to_replace: Column, replacement_values: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def clamp( @@ -32,12 +32,12 @@ def clamp( hi: Scalar, lo_replace: Scalar | None = None, hi_replace: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def normalize_nans_and_zeros( source_column: Column, inplace: bool = False, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx index c3730e3971f..4a5cc162551 100644 --- a/python/pylibcudf/pylibcudf/replace.pyx +++ b/python/pylibcudf/pylibcudf/replace.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 @@ -18,6 +18,7 @@ from pylibcudf.libcudf.replace import \ from .column cimport Column from .scalar cimport Scalar from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "ReplacePolicy", @@ -31,7 +32,7 @@ __all__ = [ cpdef Column replace_nulls( Column source_column, ReplacementType replacement, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Replace nulls in source_column. @@ -70,7 +71,8 @@ cpdef Column replace_nulls( cdef unique_ptr[column] c_result cdef replace_policy policy - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) # Due to https://github.com/cython/cython/issues/5984, if this function is @@ -84,10 +86,10 @@ cpdef Column replace_nulls( c_result = cpp_replace.replace_nulls( source_column.view(), policy, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) else: raise TypeError("replacement must be a Column, Scalar, or replace_policy") @@ -96,33 +98,33 @@ cpdef Column replace_nulls( c_result = cpp_replace.replace_nulls( source_column.view(), replacement.view(), - stream.view(), + _cs, mr.get_mr() ) elif ReplacementType is Scalar: c_result = cpp_replace.replace_nulls( source_column.view(), dereference(replacement.c_obj), - stream.view(), + _cs, mr.get_mr() ) elif ReplacementType is replace_policy: c_result = cpp_replace.replace_nulls( source_column.view(), replacement, - stream.view(), + _cs, mr.get_mr() ) else: assert False, "Internal error. 
Please contact pylibcudf developers" - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column find_and_replace_all( Column source_column, Column values_to_replace, Column replacement_values, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Replace all occurrences of values_to_replace with replacement_values. @@ -150,7 +152,8 @@ cpdef Column find_and_replace_all( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -158,10 +161,10 @@ cpdef Column find_and_replace_all( source_column.view(), values_to_replace.view(), replacement_values.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column clamp( @@ -170,7 +173,7 @@ cpdef Column clamp( Scalar hi, Scalar lo_replace=None, Scalar hi_replace=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Clamp the values in source_column to the range [lo, hi]. 
@@ -206,7 +209,8 @@ cpdef Column clamp( cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -215,7 +219,7 @@ cpdef Column clamp( source_column.view(), dereference(lo.c_obj), dereference(hi.c_obj), - stream.view(), + _cs, mr.get_mr() ) else: @@ -225,16 +229,16 @@ cpdef Column clamp( dereference(lo_replace.c_obj), dereference(hi.c_obj), dereference(hi_replace.c_obj), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column normalize_nans_and_zeros( Column source_column, bool inplace=False, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Normalize NaNs and zeros in source_column. @@ -260,24 +264,25 @@ cpdef Column normalize_nans_and_zeros( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: if inplace: cpp_replace.normalize_nans_and_zeros( source_column.mutable_view(), - stream.view(), + _cs, mr.get_mr() ) else: c_result = cpp_replace.normalize_nans_and_zeros( source_column.view(), - stream.view(), + _cs, mr.get_mr() ) if not inplace: - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) ReplacePolicy.__str__ = ReplacePolicy.__repr__ diff --git a/python/pylibcudf/pylibcudf/reshape.pxd b/python/pylibcudf/pylibcudf/reshape.pxd index fd2eb9f31ec..09a111770b5 100644 --- a/python/pylibcudf/pylibcudf/reshape.pxd +++ b/python/pylibcudf/pylibcudf/reshape.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libc.stddef cimport size_t @@ -6,7 +6,6 @@ from libc.stdint cimport uintptr_t from pylibcudf.libcudf.types cimport size_type -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.device_buffer cimport DeviceBuffer from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource @@ -17,17 +16,17 @@ from .types cimport DataType cpdef Column interleave_columns( - Table source_table, Stream stream=*, DeviceMemoryResource mr=* + Table source_table, object stream = *, DeviceMemoryResource mr=* ) cpdef Table tile( Table source_table, size_type count, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef void table_to_array( Table input_table, uintptr_t ptr, size_t size, - Stream stream=* + object stream = * ) diff --git a/python/pylibcudf/pylibcudf/reshape.pyi b/python/pylibcudf/pylibcudf/reshape.pyi index c8ca83be981..03acda18353 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyi +++ b/python/pylibcudf/pylibcudf/reshape.pyi @@ -1,26 +1,26 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def interleave_columns( source_table: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def tile( source_table: Table, count: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def table_to_array( input_table: Table, ptr: int, size: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> None: ... 
diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index b001b289794..a81dadf62ce 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libc.stddef cimport size_t @@ -24,11 +24,12 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["interleave_columns", "tile", "table_to_array"] cpdef Column interleave_columns( - Table source_table, Stream stream=None, DeviceMemoryResource mr=None + Table source_table, object stream=None, DeviceMemoryResource mr=None ): """Interleave columns of a table into a single column. @@ -55,21 +56,22 @@ cpdef Column interleave_columns( A new column which is the result of interleaving the input columns """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_interleave_columns( - source_table.view(), stream.view(), mr.get_mr() + source_table.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Table tile( Table source_table, size_type count, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Repeats the rows from input table count times to form a new table. 
@@ -93,22 +95,23 @@ cpdef Table tile( The table containing the tiled "rows" """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_tile( - source_table.view(), count, stream.view(), mr.get_mr() + source_table.view(), count, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef void table_to_array( Table input_table, uintptr_t ptr, size_t size, - Stream stream=None + object stream=None ): """ Copy a table into a preallocated column-major device array. @@ -129,7 +132,8 @@ cpdef void table_to_array( raise ValueError( "Size exceeds the size_t limit." ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() cdef device_span[byte] span = device_span[byte]( ptr, size @@ -139,5 +143,5 @@ cpdef void table_to_array( cpp_table_to_array( input_table.view(), span, - stream.view() + _cs ) diff --git a/python/pylibcudf/pylibcudf/rolling.pxd b/python/pylibcudf/pylibcudf/rolling.pxd index 5ea7dc747f4..94a6a8a6d89 100644 --- a/python/pylibcudf/pylibcudf/rolling.pxd +++ b/python/pylibcudf/pylibcudf/rolling.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -8,7 +8,6 @@ from pylibcudf.libcudf.rolling cimport ( bounded_closed, bounded_open, current_row, rolling_request, unbounded ) from pylibcudf.libcudf.types cimport null_order, order, size_type -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .aggregation cimport Aggregation @@ -63,7 +62,7 @@ cpdef Table grouped_range_rolling_window( PrecedingRangeWindowType preceding, FollowingRangeWindowType following, list requests, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -73,7 +72,7 @@ cpdef Column rolling_window( WindowType following_window, size_type min_periods, Aggregation agg, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -86,6 +85,6 @@ cpdef tuple make_range_windows( null_order null_order, PrecedingRangeWindowType preceding, FollowingRangeWindowType following, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/rolling.pyi b/python/pylibcudf/pylibcudf/rolling.pyi index 239ce9ddbd8..883f62d0d3f 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyi +++ b/python/pylibcudf/pylibcudf/rolling.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.aggregation import Aggregation from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table from pylibcudf.types import DataType, NullOrder, Order +from pylibcudf.utils import CudaStreamLike class Unbounded: ... class CurrentRow: ... 
@@ -36,7 +36,7 @@ def grouped_range_rolling_window( preceding: RangeWindowType, following: RangeWindowType, requests: list[RollingRequest], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def rolling_window[WindowType: (Column, int)]( @@ -45,7 +45,7 @@ def rolling_window[WindowType: (Column, int)]( following_window: WindowType, min_periods: int, agg: Aggregation, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_valid_rolling_aggregation( @@ -58,6 +58,6 @@ def make_range_windows( null_order: NullOrder, preceding: RangeWindowType, following: RangeWindowType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Column, Column]: ... diff --git a/python/pylibcudf/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx index 73c10e53d57..ae9d7665d69 100644 --- a/python/pylibcudf/pylibcudf/rolling.pyx +++ b/python/pylibcudf/pylibcudf/rolling.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -20,6 +20,7 @@ from .column cimport Column from .scalar cimport Scalar from .types cimport DataType from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ @@ -125,7 +126,7 @@ cpdef Table grouped_range_rolling_window( PrecedingRangeWindowType preceding, FollowingRangeWindowType following, list requests, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -164,7 +165,8 @@ cpdef Table grouped_range_rolling_window( for req in requests: crequests.push_back(move((req).view())) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -176,10 +178,10 @@ cpdef Table grouped_range_rolling_window( dereference(preceding.c_obj.get()), dereference(following.c_obj.get()), crequests, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(result), stream, mr) + return Table.from_libcudf(move(result), _stream, mr) cpdef Column rolling_window( @@ -188,7 +190,7 @@ cpdef Column rolling_window( WindowType following_window, size_type min_periods, Aggregation agg, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a rolling window operation on a column @@ -224,7 +226,8 @@ cpdef Column rolling_window( # reclaim the GIL internally for just the necessary scope like column.view() cdef const rolling_aggregation *c_agg = agg.view_underlying_as_rolling() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if WindowType is Column: @@ -235,7 +238,7 @@ cpdef Column rolling_window( following_window.view(), min_periods, dereference(c_agg), - stream.view(), + _cs, mr.get_mr() ) else: @@ -246,11 +249,11 @@ cpdef Column rolling_window( following_window, min_periods, 
dereference(c_agg), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef bool is_valid_rolling_aggregation(DataType source, Aggregation agg): @@ -278,7 +281,7 @@ cpdef tuple make_range_windows( null_order null_order, PrecedingRangeWindowType preceding, FollowingRangeWindowType following, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -308,7 +311,8 @@ cpdef tuple make_range_windows( """ cdef pair[unique_ptr[column], unique_ptr[column]] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -319,10 +323,10 @@ cpdef tuple make_range_windows( null_order, dereference(preceding.c_obj.get()), dereference(following.c_obj.get()), - stream.view(), + _cs, mr.get_mr() ) return ( - Column.from_libcudf(move(result.first), stream, mr), - Column.from_libcudf(move(result.second), stream, mr) + Column.from_libcudf(move(result.first), _stream, mr), + Column.from_libcudf(move(result.second), _stream, mr) ) diff --git a/python/pylibcudf/pylibcudf/round.pxd b/python/pylibcudf/pylibcudf/round.pxd index ecd72c62c0a..0ac0c22346f 100644 --- a/python/pylibcudf/pylibcudf/round.pxd +++ b/python/pylibcudf/pylibcudf/round.pxd @@ -5,7 +5,6 @@ from pylibcudf.libcudf.round cimport rounding_method from .column cimport Column -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource @@ -13,7 +12,7 @@ cpdef Column round( Column source, int32_t decimal_places = *, rounding_method round_method = *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = * ) @@ -21,6 +20,6 @@ cpdef Column round_decimal( Column source, int32_t decimal_places = *, rounding_method round_method = *, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = * ) diff --git 
a/python/pylibcudf/pylibcudf/round.pyi b/python/pylibcudf/pylibcudf/round.pyi index 848e43aeda7..30d08f234d5 100644 --- a/python/pylibcudf/pylibcudf/round.pyi +++ b/python/pylibcudf/pylibcudf/round.pyi @@ -4,9 +4,9 @@ from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike class RoundingMethod(IntEnum): HALF_UP = ... @@ -16,13 +16,13 @@ def round( source: Column, decimal_places: int = 0, round_method: RoundingMethod = RoundingMethod.HALF_UP, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def round_decimal( source: Column, decimal_places: int = 0, round_method: RoundingMethod = RoundingMethod.HALF_UP, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx index 84a7ba6dbdf..f5baa6bbd23 100644 --- a/python/pylibcudf/pylibcudf/round.pyx +++ b/python/pylibcudf/pylibcudf/round.pyx @@ -19,6 +19,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["RoundingMethod", "round"] @@ -26,7 +27,7 @@ cpdef Column round( Column source, int32_t decimal_places = 0, rounding_method round_method = rounding_method.HALF_UP, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Rounds all the values in a column to the specified number of decimal places. 
@@ -58,7 +59,8 @@ cpdef Column round( A Column with values rounded """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -66,18 +68,18 @@ cpdef Column round( source.view(), decimal_places, round_method, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column round_decimal( Column source, int32_t decimal_places = 0, rounding_method round_method = rounding_method.HALF_UP, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """Rounds all the values in a column to the specified number of decimal places. @@ -106,7 +108,8 @@ cpdef Column round_decimal( A Column with values rounded """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -114,10 +117,10 @@ cpdef Column round_decimal( source.view(), decimal_places, round_method, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) RoundingMethod.__str__ = RoundingMethod.__repr__ diff --git a/python/pylibcudf/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd index 5230c0316be..b628b9185a6 100644 --- a/python/pylibcudf/pylibcudf/scalar.pxd +++ b/python/pylibcudf/pylibcudf/scalar.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -6,7 +6,6 @@ from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.scalar.scalar cimport scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .types cimport DataType @@ -24,10 +23,10 @@ cdef class Scalar: cdef const scalar* get(self) noexcept nogil cpdef DataType type(self) - cpdef bool is_valid(self, Stream stream=*) + cpdef bool is_valid(self, object stream = *) @staticmethod - cdef Scalar empty_like(Column column, Stream stream, DeviceMemoryResource mr) + cdef Scalar empty_like(Column column, object stream, DeviceMemoryResource mr) @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) diff --git a/python/pylibcudf/pylibcudf/scalar.pyi b/python/pylibcudf/pylibcudf/scalar.pyi index ef940d8c021..a204894afd8 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyi +++ b/python/pylibcudf/pylibcudf/scalar.pyi @@ -3,11 +3,10 @@ from typing import Any -from rmm.pylibrmm.stream import Stream - from pylibcudf._interop_helpers import ColumnMetadata from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike NpGeneric = type[Any] @@ -16,31 +15,33 @@ PaScalar = type[Any] class Scalar: def __init__(self): ... def type(self) -> DataType: ... - def is_valid(self, stream: Stream) -> bool: ... + def is_valid(self, stream: CudaStreamLike) -> bool: ... @staticmethod - def empty_like(column: Column, stream: Stream | None = None) -> Scalar: ... + def empty_like( + column: Column, stream: CudaStreamLike | None = None + ) -> Scalar: ... def to_arrow( self, metadata: ColumnMetadata | str | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> PaScalar: ... 
@staticmethod def from_arrow( pa_val: Any, dtype: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> Scalar: ... @classmethod def from_py( cls, py_val: Any, dtype: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> Scalar: ... @classmethod def from_numpy( - cls, np_val: NpGeneric, stream: Stream | None = None + cls, np_val: NpGeneric, stream: CudaStreamLike | None = None ) -> Scalar: ... def to_py( - self, stream: Stream | None = None + self, stream: CudaStreamLike | None = None ) -> None | int | float | str | bool: ... diff --git a/python/pylibcudf/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx index 8771b4a75fd..54e088787a5 100644 --- a/python/pylibcudf/pylibcudf/scalar.pyx +++ b/python/pylibcudf/pylibcudf/scalar.pyx @@ -57,6 +57,7 @@ from rmm.pylibrmm.memory_resource cimport ( get_current_device_resource, ) from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t from .column cimport Column from .traits cimport is_floating_point @@ -151,10 +152,11 @@ cdef class Scalar: """The type of data in the column.""" return self._data_type - cpdef bool is_valid(self, Stream stream = None): + cpdef bool is_valid(self, object stream = None): """True if the scalar is valid, false if not""" - stream = _get_stream(stream) - return self.get().is_valid(stream.view()) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() + return self.get().is_valid(_cs) def to_arrow( self, @@ -176,7 +178,9 @@ cdef class Scalar: """ # Note that metadata for scalars is primarily important for preserving # information on nested types since names are otherwise irrelevant. 
- return Column.from_scalar(self, 1, stream).to_arrow(metadata=metadata)[0] + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() + return Column.from_scalar(self, 1, _stream).to_arrow(metadata=metadata)[0] @staticmethod def from_arrow( @@ -205,7 +209,7 @@ cdef class Scalar: return _from_arrow(pa_val, dtype, stream) @staticmethod - cdef Scalar empty_like(Column column, Stream stream, DeviceMemoryResource mr): + cdef Scalar empty_like(Column column, object stream, DeviceMemoryResource mr): """Construct a null scalar with the same type as column. Parameters @@ -221,8 +225,10 @@ cdef class Scalar: ------- New empty (null) scalar of the given type. """ + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() return Scalar.from_libcudf( - move(make_empty_scalar_like(column.view(), stream.view(), mr.get_mr())) + move(make_empty_scalar_like(column.view(), _cs, mr.get_mr())) ) @staticmethod @@ -266,9 +272,10 @@ cdef class Scalar: Scalar New pylibcudf.Scalar """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) - return _from_py(py_val, dtype, stream, mr) + return _from_py(py_val, dtype, _stream, mr) @classmethod def from_numpy( @@ -294,9 +301,10 @@ cdef class Scalar: Scalar New pylibcudf.Scalar """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) - return _from_numpy(np_val, stream, mr) + return _from_numpy(np_val, _stream, mr) def to_py(self, stream: Stream | None = None): """ @@ -312,39 +320,40 @@ cdef class Scalar: Python scalar A Python scalar associated with the type of the Scalar. 
""" - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() if not self.is_valid(stream): return None cdef type_id tid = self.type().id() cdef const scalar* slr = self.c_obj.get() if tid == type_id.BOOL8: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.STRING: - return (slr).to_string(stream.view()).decode() + return (slr).to_string(_cs).decode() elif tid == type_id.FLOAT32: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.FLOAT64: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.INT8: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.INT16: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.INT32: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.INT64: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.UINT8: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.UINT16: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.UINT32: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.UINT64: - return (slr).value(stream.view()) + return (slr).value(_cs) elif tid == type_id.DECIMAL128: return decimal.Decimal( - (slr).value(stream.view()).value() + (slr).value(_cs).value() ).scaleb( (slr).type().scale() ) @@ -375,6 +384,8 @@ def _from_py( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() cdef DataType c_dtype if dtype is None: raise ValueError("Must specify a dtype for a None value.") @@ -382,7 +393,7 @@ def _( c_dtype = dtype cdef unique_ptr[scalar] c_obj = make_default_constructed_scalar( c_dtype.c_obj, - stream.view(), + _cs, mr.get_mr() ) return _new_scalar(move(c_obj), dtype) @@ 
-402,6 +413,8 @@ def _( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() cdef unique_ptr[scalar] c_obj cdef DataType c_dtype if dtype is None: @@ -414,11 +427,11 @@ def _( if tid == type_id.FLOAT32: if abs(py_val) > numeric_limits[float].max(): raise OverflowError(f"{py_val} out of range for FLOAT32 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.FLOAT64: - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) else: typ = c_dtype.id() raise TypeError(f"Cannot convert float to Scalar with dtype {typ.name}") @@ -430,6 +443,8 @@ def _( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() cdef unique_ptr[scalar] c_obj cdef DataType c_dtype cdef duration_ns c_duration_ns @@ -440,7 +455,7 @@ def _( if dtype is None: c_dtype = dtype = DataType(type_id.INT64) elif is_floating_point(dtype): - return _from_py(float(py_val), dtype, stream, mr) + return _from_py(float(py_val), dtype, _stream, mr) else: c_dtype = dtype cdef type_id tid = c_dtype.id() @@ -450,80 +465,80 @@ def _( numeric_limits[int8_t].min() <= py_val <= numeric_limits[int8_t].max() ): raise OverflowError(f"{py_val} out of range for INT8 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.INT16: if not ( 
numeric_limits[int16_t].min() <= py_val <= numeric_limits[int16_t].max() ): raise OverflowError(f"{py_val} out of range for INT16 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.INT32: if not ( numeric_limits[int32_t].min() <= py_val <= numeric_limits[int32_t].max() ): raise OverflowError(f"{py_val} out of range for INT32 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.INT64: if not ( numeric_limits[int64_t].min() <= py_val <= numeric_limits[int64_t].max() ): raise OverflowError(f"{py_val} out of range for INT64 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.UINT8: if py_val < 0: raise ValueError("Cannot assign negative value to UINT8 scalar") if py_val > numeric_limits[uint8_t].max(): raise OverflowError(f"{py_val} out of range for UINT8 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.UINT16: if py_val < 0: raise ValueError("Cannot assign negative value to UINT16 scalar") if py_val > numeric_limits[uint16_t].max(): raise OverflowError(f"{py_val} out of range for UINT16 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) 
+ (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.UINT32: if py_val < 0: raise ValueError("Cannot assign negative value to UINT32 scalar") if py_val > numeric_limits[uint32_t].max(): raise OverflowError(f"{py_val} out of range for UINT32 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.UINT64: if py_val < 0: raise ValueError("Cannot assign negative value to UINT64 scalar") if py_val > numeric_limits[uint64_t].max(): raise OverflowError(f"{py_val} out of range for UINT64 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val, _cs) elif tid == type_id.BOOL8: if py_val not in (0, 1): raise ValueError(f"Cannot convert {py_val} to BOOL8 scalar") - c_obj = make_numeric_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) - (c_obj.get()).set_value(py_val != 0, stream.view()) + c_obj = make_numeric_scalar(c_dtype.c_obj, _cs, mr.get_mr()) + (c_obj.get()).set_value(py_val != 0, _cs) elif tid == type_id.DURATION_NANOSECONDS: if py_val > numeric_limits[int64_t].max(): raise OverflowError( f"{py_val} nanoseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_ns = duration_ns(py_val) (c_obj.get()).set_value( - c_duration_ns, stream.view() + c_duration_ns, _cs ) elif tid == type_id.DURATION_MICROSECONDS: @@ -531,10 +546,10 @@ def _( raise OverflowError( f"{py_val} microseconds out of range for INT64 limit." 
) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_us = duration_us(py_val) (c_obj.get()).set_value( - c_duration_us, stream.view() + c_duration_us, _cs ) elif tid == type_id.DURATION_MILLISECONDS: @@ -542,10 +557,10 @@ def _( raise OverflowError( f"{py_val} milliseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_ms = duration_ms(py_val) (c_obj.get()).set_value( - c_duration_ms, stream.view() + c_duration_ms, _cs ) elif tid == type_id.DURATION_SECONDS: @@ -553,10 +568,10 @@ def _( raise OverflowError( f"{py_val} seconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_s = duration_s(py_val) (c_obj.get()).set_value( - c_duration_s, stream.view() + c_duration_s, _cs ) elif tid == type_id.DURATION_DAYS: @@ -564,10 +579,10 @@ def _( raise OverflowError( f"{py_val} days out of range for INT32 limit." 
) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_D = duration_D(py_val) (c_obj.get()).set_value( - c_duration_D, stream.view() + c_duration_D, _cs ) else: @@ -581,6 +596,8 @@ def _( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() if dtype is None: dtype = DataType(type_id.BOOL8) elif dtype.id() != type_id.BOOL8: @@ -591,10 +608,10 @@ def _( cdef unique_ptr[scalar] c_obj = make_numeric_scalar( (dtype).c_obj, - stream.view(), + _cs, mr.get_mr() ) - (c_obj.get()).set_value(py_val, stream.view()) + (c_obj.get()).set_value(py_val, _cs) return _new_scalar(move(c_obj), dtype) @@ -602,6 +619,8 @@ def _( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() if dtype is None: dtype = DataType(type_id.STRING) elif dtype.id() != type_id.STRING: @@ -610,7 +629,7 @@ def _( f"Cannot convert str to Scalar with dtype {tid.name}" ) cdef unique_ptr[scalar] c_obj = make_string_scalar( - py_val.encode(), stream.view(), mr.get_mr() + py_val.encode(), _cs, mr.get_mr() ) return _new_scalar(move(c_obj), dtype) @@ -619,6 +638,8 @@ def _( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() cdef unique_ptr[scalar] c_obj cdef duration_us c_duration_us cdef duration_ns c_duration_ns @@ -637,10 +658,10 @@ def _( raise OverflowError( f"{total_nanoseconds} nanoseconds out of range for INT64 limit." 
) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_ns = duration_ns(total_nanoseconds) (c_obj.get()).set_value( - c_duration_ns, stream.view() + c_duration_ns, _cs ) elif tid == type_id.DURATION_MICROSECONDS: total_microseconds = int(total_seconds * 1_000_000) @@ -648,10 +669,10 @@ def _( raise OverflowError( f"{total_microseconds} microseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_us = duration_us(total_microseconds) (c_obj.get()).set_value( - c_duration_us, stream.view() + c_duration_us, _cs ) elif tid == type_id.DURATION_MILLISECONDS: total_milliseconds = int(total_seconds * 1_000) @@ -659,10 +680,10 @@ def _( raise OverflowError( f"{total_milliseconds} milliseconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_ms = duration_ms(total_milliseconds) (c_obj.get()).set_value( - c_duration_ms, stream.view() + c_duration_ms, _cs ) elif tid == type_id.DURATION_SECONDS: total_seconds = int(total_seconds) @@ -670,10 +691,10 @@ def _( raise OverflowError( f"{total_seconds} seconds out of range for INT64 limit." ) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_s = duration_s(total_seconds) (c_obj.get()).set_value( - c_duration_s, stream.view() + c_duration_s, _cs ) elif tid == type_id.DURATION_DAYS: total_days = int(total_seconds // 86400) @@ -681,10 +702,10 @@ def _( raise OverflowError( f"{total_days} days out of range for INT32 limit." 
) - c_obj = make_duration_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_duration_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_D = duration_D(total_days) (c_obj.get()).set_value( - c_duration_D, stream.view() + c_duration_D, _cs ) else: typ = c_dtype.id() @@ -696,6 +717,8 @@ def _( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() cdef unique_ptr[scalar] c_obj cdef duration_us c_duration_us cdef duration_ns c_duration_ns @@ -727,11 +750,11 @@ def _( raise OverflowError( f"{epoch_nanoseconds} nanoseconds out of range for INT64 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_timestamp_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_ns = duration_ns(epoch_nanoseconds) c_timestamp_ns = timestamp_ns(c_duration_ns) (c_obj.get()).set_value( - c_timestamp_ns, stream.view() + c_timestamp_ns, _cs ) elif tid == type_id.TIMESTAMP_MICROSECONDS: epoch_microseconds = int(epoch_seconds * 1_000_000) @@ -739,11 +762,11 @@ def _( raise OverflowError( f"{epoch_microseconds} microseconds out of range for INT64 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_timestamp_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_us = duration_us(epoch_microseconds) c_timestamp_us = timestamp_us(c_duration_us) (c_obj.get()).set_value( - c_timestamp_us, stream.view() + c_timestamp_us, _cs ) elif tid == type_id.TIMESTAMP_MILLISECONDS: epoch_milliseconds = int(epoch_seconds * 1_000) @@ -751,11 +774,11 @@ def _( raise OverflowError( f"{epoch_milliseconds} milliseconds out of range for INT64 limit." 
) - c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_timestamp_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_ms = duration_ms(epoch_milliseconds) c_timestamp_ms = timestamp_ms(c_duration_ms) (c_obj.get()).set_value( - c_timestamp_ms, stream.view() + c_timestamp_ms, _cs ) elif tid == type_id.TIMESTAMP_SECONDS: epoch_seconds = int(epoch_seconds) @@ -763,11 +786,11 @@ def _( raise OverflowError( f"{epoch_seconds} seconds out of range for INT64 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_timestamp_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_s = duration_s(epoch_seconds) c_timestamp_s = timestamp_s(c_duration_s) (c_obj.get()).set_value( - c_timestamp_s, stream.view() + c_timestamp_s, _cs ) elif tid == type_id.TIMESTAMP_DAYS: epoch_days = int(epoch_seconds // 86400) @@ -775,11 +798,11 @@ def _( raise OverflowError( f"{epoch_days} days out of range for INT32 limit." ) - c_obj = make_timestamp_scalar(c_dtype.c_obj, stream.view(), mr.get_mr()) + c_obj = make_timestamp_scalar(c_dtype.c_obj, _cs, mr.get_mr()) c_duration_D = duration_D(epoch_days) c_timestamp_D = timestamp_D(c_duration_D) (c_obj.get()).set_value( - c_timestamp_D, stream.view() + c_timestamp_D, _cs ) else: typ = c_dtype.id() @@ -791,6 +814,8 @@ def _( def _( py_val, dtype: DataType | None, stream: Stream, mr: DeviceMemoryResource ): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() scale = py_val.as_tuple().exponent as_int = int(py_val.scaleb(-scale)) @@ -804,7 +829,7 @@ def _( cdef unique_ptr[scalar] c_obj = make_fixed_point_scalar[decimal128]( val, scale_type(scale), - stream.view(), + _cs, mr.get_mr() ) return _new_scalar(move(c_obj), dtype) @@ -829,21 +854,25 @@ if np is not None: @_from_numpy.register(np.bool_) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() cdef DataType dtype = 
DataType(type_id.BOOL8) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) cdef cbool c_val = np_val - (c_obj.get()).set_value(c_val, stream.view()) + (c_obj.get()).set_value(c_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.str_) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() cdef DataType dtype = DataType(type_id.STRING) cdef unique_ptr[scalar] c_obj = make_string_scalar( np_val.item().encode(), - stream.view(), + _cs, mr.get_mr() ) cdef Scalar slr = _new_scalar(move(c_obj), dtype) @@ -851,101 +880,121 @@ if np is not None: @_from_numpy.register(np.int8) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.INT8) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.int16) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.INT16) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.int32) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.INT32) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() 
+ dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.int64) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.INT64) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint8) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.UINT8) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint16) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.UINT16) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint32) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.UINT32) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, 
stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.uint64) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.UINT64) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.float32) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.FLOAT32) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr @_from_numpy.register(np.float64) def _(np_val, stream: Stream, mr: DeviceMemoryResource): + cdef Stream _stream = stream + cdef cudaStream_t _cs = _stream.view().value() dtype = DataType(type_id.FLOAT64) cdef unique_ptr[scalar] c_obj = make_numeric_scalar( - dtype.c_obj, stream.view(), mr.get_mr() + dtype.c_obj, _cs, mr.get_mr() ) - (c_obj.get()).set_value(np_val, stream.view()) + (c_obj.get()).set_value(np_val, _cs) cdef Scalar slr = _new_scalar(move(c_obj), dtype) return slr diff --git a/python/pylibcudf/pylibcudf/search.pxd b/python/pylibcudf/pylibcudf/search.pxd index 7b0725bf60b..c26a6689240 100644 --- a/python/pylibcudf/pylibcudf/search.pxd +++ b/python/pylibcudf/pylibcudf/search.pxd @@ -1,7 +1,6 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column @@ -13,7 +12,7 @@ cpdef Column lower_bound( Table needles, list column_order, list null_precedence, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -22,10 +21,10 @@ cpdef Column upper_bound( Table needles, list column_order, list null_precedence, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Column contains( - Column haystack, Column needles, Stream stream = *, DeviceMemoryResource mr = * + Column haystack, Column needles, object stream = *, DeviceMemoryResource mr = * ) diff --git a/python/pylibcudf/pylibcudf/search.pyi b/python/pylibcudf/pylibcudf/search.pyi index eaec283a32a..6cc58946f56 100644 --- a/python/pylibcudf/pylibcudf/search.pyi +++ b/python/pylibcudf/pylibcudf/search.pyi @@ -1,19 +1,19 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table from pylibcudf.types import NullOrder, Order +from pylibcudf.utils import CudaStreamLike def lower_bound( haystack: Table, needles: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def upper_bound( @@ -21,12 +21,12 @@ def upper_bound( needles: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def contains( haystack: Column, needles: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index 4915b1b8be9..885d25f2d49 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -13,6 +13,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["contains", "lower_bound", "upper_bound"] @@ -21,7 +22,7 @@ cpdef Column lower_bound( Table needles, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Find smallest indices in haystack where needles may be inserted to retain order. 
@@ -52,7 +53,8 @@ cpdef Column lower_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -61,10 +63,10 @@ cpdef Column lower_bound( needles.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column upper_bound( @@ -72,7 +74,7 @@ cpdef Column upper_bound( Table needles, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Find largest indices in haystack where needles may be inserted to retain order. @@ -103,7 +105,8 @@ cpdef Column upper_bound( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -112,14 +115,14 @@ cpdef Column upper_bound( needles.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column contains( - Column haystack, Column needles, Stream stream=None, DeviceMemoryResource mr=None + Column haystack, Column needles, object stream=None, DeviceMemoryResource mr=None ): """Check whether needles are present in haystack. 
@@ -143,14 +146,15 @@ cpdef Column contains( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_search.contains( haystack.view(), needles.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/sorting.pxd b/python/pylibcudf/pylibcudf/sorting.pxd index 701b6803c34..a081ece747a 100644 --- a/python/pylibcudf/pylibcudf/sorting.pxd +++ b/python/pylibcudf/pylibcudf/sorting.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from pylibcudf.libcudf.aggregation cimport rank_method from pylibcudf.libcudf.types cimport null_order, null_policy, order, size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table @@ -15,7 +14,7 @@ cpdef Column sorted_order( Table source_table, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -23,7 +22,7 @@ cpdef Column stable_sorted_order( Table source_table, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -34,12 +33,12 @@ cpdef Column rank( null_policy null_handling, null_order null_precedence, bool percentage, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef bool is_sorted( - Table table, list column_order, list null_precedence, Stream stream=* + Table table, list column_order, list null_precedence, object stream = * ) cpdef Table segmented_sort_by_key( @@ -48,7 +47,7 @@ cpdef Table segmented_sort_by_key( 
Column segment_offsets, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -58,7 +57,7 @@ cpdef Table stable_segmented_sort_by_key( Column segment_offsets, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -67,7 +66,7 @@ cpdef Table sort_by_key( Table keys, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -76,7 +75,7 @@ cpdef Table stable_sort_by_key( Table keys, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -84,7 +83,7 @@ cpdef Table sort( Table source_table, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -92,7 +91,7 @@ cpdef Table stable_sort( Table source_table, list column_order, list null_precedence, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -100,7 +99,7 @@ cpdef Column top_k( Column col, size_type k, order sort_order=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -108,6 +107,6 @@ cpdef Column top_k_order( Column col, size_type k, order sort_order=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/sorting.pyi b/python/pylibcudf/pylibcudf/sorting.pyi index 8f00fcade6e..a06586a8f39 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyi +++ b/python/pylibcudf/pylibcudf/sorting.pyi @@ -1,26 +1,26 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.aggregation import RankMethod from pylibcudf.column import Column from pylibcudf.table import Table from pylibcudf.types import NullOrder, NullPolicy, Order +from pylibcudf.utils import CudaStreamLike def sorted_order( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def stable_sorted_order( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def rank( @@ -30,14 +30,14 @@ def rank( null_handling: NullPolicy, null_precedence: NullOrder, percentage: bool, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_sorted( tbl: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> bool: ... def segmented_sort_by_key( values: Table, @@ -45,7 +45,7 @@ def segmented_sort_by_key( segment_offsets: Column, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def stable_segmented_sort_by_key( @@ -54,7 +54,7 @@ def stable_segmented_sort_by_key( segment_offsets: Column, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
def sort_by_key( @@ -62,7 +62,7 @@ def sort_by_key( keys: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def stable_sort_by_key( @@ -70,34 +70,34 @@ def stable_sort_by_key( keys: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def sort( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def stable_sort( source_table: Table, column_order: list[Order], null_precedence: list[NullOrder], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def top_k( col: Column, k: int, sort_order: Order = Order.DESCENDING, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def top_k_order( col: Column, k: int, sort_order: Order = Order.DESCENDING, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index be668ff2526..fa0ed78b709 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -15,6 +15,7 @@ from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "is_sorted", @@ -33,7 +34,7 @@ cpdef Column sorted_order( Table source_table, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the row indices required to sort the table. @@ -58,7 +59,8 @@ cpdef Column sorted_order( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -66,17 +68,17 @@ cpdef Column sorted_order( source_table.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column stable_sorted_order( Table source_table, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the row indices required to sort the table, @@ -102,7 +104,8 @@ cpdef Column stable_sorted_order( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -110,10 +113,10 @@ cpdef Column stable_sorted_order( source_table.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column rank( @@ -123,7 +126,7 @@ cpdef Column rank( 
null_policy null_handling, null_order null_precedence, bool percentage, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Computes the rank of each element in the column. @@ -152,7 +155,8 @@ cpdef Column rank( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -163,14 +167,14 @@ cpdef Column rank( null_handling, null_precedence, percentage, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef bool is_sorted( - Table tbl, list column_order, list null_precedence, Stream stream=None + Table tbl, list column_order, list null_precedence, object stream=None ): """Checks if the table is sorted. @@ -194,14 +198,15 @@ cpdef bool is_sorted( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() with nogil: c_result = cpp_sorting.is_sorted( tbl.view(), c_orders, c_null_precedence, - stream.view() + _cs ) return c_result @@ -212,7 +217,7 @@ cpdef Table segmented_sort_by_key( Column segment_offsets, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Sorts the table by key, within segments. 
@@ -241,7 +246,8 @@ cpdef Table segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -251,10 +257,10 @@ cpdef Table segmented_sort_by_key( segment_offsets.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table stable_segmented_sort_by_key( @@ -263,7 +269,7 @@ cpdef Table stable_segmented_sort_by_key( Column segment_offsets, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Sorts the table by key preserving order of equal elements, @@ -293,7 +299,8 @@ cpdef Table stable_segmented_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -303,10 +310,10 @@ cpdef Table stable_segmented_sort_by_key( segment_offsets.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table sort_by_key( @@ -314,7 +321,7 @@ cpdef Table sort_by_key( Table keys, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Sorts the table by key. 
@@ -341,7 +348,8 @@ cpdef Table sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -350,10 +358,10 @@ cpdef Table sort_by_key( keys.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table stable_sort_by_key( @@ -361,7 +369,7 @@ cpdef Table stable_sort_by_key( Table keys, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Sorts the table by key preserving order of equal elements. @@ -388,7 +396,8 @@ cpdef Table stable_sort_by_key( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -397,17 +406,17 @@ cpdef Table stable_sort_by_key( keys.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table sort( Table source_table, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Sorts the table. 
@@ -432,7 +441,8 @@ cpdef Table sort( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -440,17 +450,17 @@ cpdef Table sort( source_table.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table stable_sort( Table source_table, list column_order, list null_precedence, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Sorts the table preserving order of equal elements. @@ -475,7 +485,8 @@ cpdef Table stable_sort( cdef vector[order] c_orders = column_order cdef vector[null_order] c_null_precedence = null_precedence - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -483,17 +494,17 @@ cpdef Table stable_sort( source_table.view(), c_orders, c_null_precedence, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Column top_k( Column col, size_type k, order sort_order = order.DESCENDING, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -518,7 +529,8 @@ cpdef Column top_k( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -526,17 +538,17 @@ cpdef Column top_k( col.view(), k, sort_order, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column top_k_order( Column col, 
size_type k, order sort_order = order.DESCENDING, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -564,7 +576,8 @@ cpdef Column top_k_order( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -572,7 +585,7 @@ cpdef Column top_k_order( col.view(), k, sort_order, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index 03b463f5f3a..6e904e11ce1 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -8,7 +8,6 @@ from pylibcudf.libcudf.types cimport ( size_type, ) from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream from .column cimport Column from .expressions cimport Expression @@ -19,7 +18,7 @@ cpdef Table drop_nulls( Table source_table, list keys, size_type keep_threshold, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -27,14 +26,14 @@ cpdef Table drop_nans( Table source_table, list keys, size_type keep_threshold, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef Table apply_boolean_mask( Table source_table, Column boolean_mask, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -43,7 +42,7 @@ cpdef Table unique( list keys, duplicate_keep_option keep, null_equality nulls_equal, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -53,7 +52,7 @@ cpdef Table distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -62,7 +61,7 @@ cpdef Column 
distinct_indices( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -72,7 +71,7 @@ cpdef Table stable_distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -80,6 +79,6 @@ cpdef Table filter( Table predicate_table, Expression predicate_expr, Table filter_table, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi index 49c44f82486..afdd692dde2 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyi +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -4,12 +4,12 @@ from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.expressions import Expression from pylibcudf.table import Table from pylibcudf.types import NanEquality, NullEquality +from pylibcudf.utils import CudaStreamLike class DuplicateKeepOption(IntEnum): KEEP_ANY = ... @@ -21,20 +21,20 @@ def drop_nulls( source_table: Table, keys: list[int], keep_threshold: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def drop_nans( source_table: Table, keys: list[int], keep_threshold: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def apply_boolean_mask( source_table: Table, boolean_mask: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
def unique( @@ -42,7 +42,7 @@ def unique( keys: list[int], keep: DuplicateKeepOption, nulls_equal: NullEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def distinct( @@ -51,7 +51,7 @@ def distinct( keep: DuplicateKeepOption, nulls_equal: NullEquality, nans_equal: NanEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def distinct_indices( @@ -59,7 +59,7 @@ def distinct_indices( keep: DuplicateKeepOption, nulls_equal: NullEquality, nans_equal: NanEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def stable_distinct( @@ -68,13 +68,13 @@ def stable_distinct( keep: DuplicateKeepOption, nulls_equal: NullEquality, nans_equal: NanEquality, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def filter( predicate_table: Table, predicate_expr: Expression, filter_table: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index 4e676602cf8..b4751078acb 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -24,6 +24,7 @@ from .column cimport Column from .expressions cimport Expression from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "DuplicateKeepOption", @@ -41,7 +42,7 @@ cpdef Table drop_nulls( Table source_table, list keys, size_type keep_threshold, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Filters out rows from the input table based on the presence of nulls. @@ -65,21 +66,22 @@ cpdef Table drop_nulls( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_stream_compaction.drop_nulls( - source_table.view(), c_keys, keep_threshold, stream.view(), mr.get_mr() + source_table.view(), c_keys, keep_threshold, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table drop_nans( Table source_table, list keys, size_type keep_threshold, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Filters out rows from the input table based on the presence of NaNs. 
@@ -103,20 +105,21 @@ cpdef Table drop_nans( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_stream_compaction.drop_nans( - source_table.view(), c_keys, keep_threshold, stream.view(), mr.get_mr() + source_table.view(), c_keys, keep_threshold, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table apply_boolean_mask( Table source_table, Column boolean_mask, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Filters out rows from the input table based on a boolean mask. @@ -137,14 +140,15 @@ cpdef Table apply_boolean_mask( """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_stream_compaction.apply_boolean_mask( - source_table.view(), boolean_mask.view(), stream.view(), mr.get_mr() + source_table.view(), boolean_mask.view(), _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table unique( @@ -152,7 +156,7 @@ cpdef Table unique( list keys, duplicate_keep_option keep, null_equality nulls_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Filter duplicate consecutive rows from the input table. 
@@ -184,14 +188,15 @@ cpdef Table unique( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_stream_compaction.unique( - input.view(), c_keys, keep, nulls_equal, stream.view(), mr.get_mr() + input.view(), c_keys, keep, nulls_equal, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table distinct( @@ -200,7 +205,7 @@ cpdef Table distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Get the distinct rows from the input table. @@ -229,15 +234,16 @@ cpdef Table distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_stream_compaction.distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal, stream.view(), + input.view(), c_keys, keep, nulls_equal, nans_equal, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Column distinct_indices( @@ -245,7 +251,7 @@ cpdef Column distinct_indices( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Get the indices of the distinct rows from the input table. 
@@ -270,14 +276,15 @@ cpdef Column distinct_indices( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_stream_compaction.distinct_indices( - input.view(), keep, nulls_equal, nans_equal, stream.view(), mr.get_mr() + input.view(), keep, nulls_equal, nans_equal, _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Table stable_distinct( @@ -286,7 +293,7 @@ cpdef Table stable_distinct( duplicate_keep_option keep, null_equality nulls_equal, nan_equality nans_equal, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Get the distinct rows from the input table, preserving input order. @@ -315,22 +322,23 @@ cpdef Table stable_distinct( cdef unique_ptr[table] c_result cdef vector[size_type] c_keys = keys - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_stream_compaction.stable_distinct( - input.view(), c_keys, keep, nulls_equal, nans_equal, stream.view(), + input.view(), c_keys, keep, nulls_equal, nans_equal, _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table filter( Table predicate_table, Expression predicate_expr, Table filter_table, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Filters a table using a predicate expression. 
@@ -353,7 +361,8 @@ cpdef Table filter( """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -361,9 +370,9 @@ cpdef Table filter( predicate_table.view(), dereference(predicate_expr.c_obj.get()), filter_table.view(), - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) DuplicateKeepOption.__str__ = DuplicateKeepOption.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/strings/attributes.pxd index 68b1ce9b5a0..64533b1ce3d 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pxd +++ b/python/pylibcudf/pylibcudf/strings/attributes.pxd @@ -1,19 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column count_characters( - Column source_strings, Stream stream=*, DeviceMemoryResource mr=* + Column source_strings, object stream = *, DeviceMemoryResource mr=* ) cpdef Column count_bytes( - Column source_strings, Stream stream=*, DeviceMemoryResource mr=* + Column source_strings, object stream = *, DeviceMemoryResource mr=* ) cpdef Column code_points( - Column source_strings, Stream stream=*, DeviceMemoryResource mr=* + Column source_strings, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyi b/python/pylibcudf/pylibcudf/strings/attributes.pyi index 06b76e669d3..2e28fb9f186 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyi +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyi @@ -1,23 +1,23 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def count_characters( source_strings: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def count_bytes( source_strings: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def code_points( source_strings: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx index 2449d51122f..334270ea834 100644 --- a/python/pylibcudf/pylibcudf/strings/attributes.pyx +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -9,11 +9,12 @@ from pylibcudf.libcudf.strings cimport attributes as cpp_attributes from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["code_points", "count_bytes", "count_characters"] cpdef Column count_characters( - Column source_strings, Stream stream=None, DeviceMemoryResource mr=None + Column source_strings, object stream=None, DeviceMemoryResource mr=None ): """ Returns a column containing character lengths of each string @@ -32,19 +33,20 @@ cpdef Column count_characters( New column with lengths for each string """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_attributes.count_characters( - source_strings.view(), stream.view(), mr.get_mr() + source_strings.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column count_bytes( - Column source_strings, Stream stream=None, DeviceMemoryResource mr=None + Column source_strings, object stream=None, DeviceMemoryResource mr=None ): """ Returns a column containing byte lengths of each string @@ -63,19 +65,20 @@ cpdef Column count_bytes( New column with the number of bytes for each string """ 
cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_attributes.count_bytes( - source_strings.view(), stream.view(), mr.get_mr() + source_strings.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column code_points( - Column source_strings, Stream stream=None, DeviceMemoryResource mr=None + Column source_strings, object stream=None, DeviceMemoryResource mr=None ): """ Creates a numeric column with code point values (integers) @@ -94,12 +97,13 @@ cpdef Column code_points( New column with code point integer values for each character """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_attributes.code_points( - source_strings.view(), stream.view(), mr.get_mr() + source_strings.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pxd b/python/pylibcudf/pylibcudf/strings/capitalize.pxd index ccbe15b3794..1a68c29e05c 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pxd +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pxd @@ -1,20 +1,19 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar from pylibcudf.libcudf.strings.char_types cimport string_character_types from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column capitalize( - Column input, Scalar delimiters=*, Stream stream=*, DeviceMemoryResource mr=* + Column input, Scalar delimiters=*, object stream = *, DeviceMemoryResource mr=* ) cpdef Column title( Column input, string_character_types sequence_type=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) -cpdef Column is_title(Column input, Stream stream=*, DeviceMemoryResource mr=*) +cpdef Column is_title(Column input, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyi b/python/pylibcudf/pylibcudf/strings/capitalize.pyi index 35554e6fff3..031d244bf25 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyi +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyi @@ -1,27 +1,27 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.strings.char_types import StringCharacterTypes +from pylibcudf.utils import CudaStreamLike def capitalize( input: Column, delimiters: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def title( input: Column, sequence_type: StringCharacterTypes = StringCharacterTypes.ALPHA, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def is_title( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx index 11291bd1243..be8c52a59b5 100644 --- a/python/pylibcudf/pylibcudf/strings/capitalize.pyx +++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -17,13 +17,14 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["capitalize", "is_title", "title"] cpdef Column capitalize( Column input, Scalar delimiters=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, # TODO: default scalar values # https://github.com/rapidsai/cudf/issues/15505 @@ -45,12 +46,13 @@ cpdef Column capitalize( Column of strings capitalized from the input column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if delimiters is None: delimiters = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) cdef const string_scalar* cpp_delimiters = ( @@ -61,17 +63,17 @@ cpdef Column capitalize( c_result = cpp_capitalize.capitalize( input.view(), dereference(cpp_delimiters), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column title( Column input, 
string_character_types sequence_type=string_character_types.ALPHA, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Modifies first character of each word to upper-case and lower-cases @@ -92,17 +94,18 @@ cpdef Column title( Column of titled strings """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_capitalize.title( - input.view(), sequence_type, stream.view(), mr.get_mr() + input.view(), sequence_type, _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef Column is_title(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_title(Column input, object stream=None, DeviceMemoryResource mr=None): """Checks if the strings in the input column are title formatted. For details, see :cpp:func:`is_title`. @@ -118,9 +121,10 @@ cpdef Column is_title(Column input, Stream stream=None, DeviceMemoryResource mr= Column of type BOOL8 """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_capitalize.is_title(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_capitalize.is_title(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/case.pxd b/python/pylibcudf/pylibcudf/strings/case.pxd index 8a959fb61d5..fea9f68e95e 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pxd +++ b/python/pylibcudf/pylibcudf/strings/case.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream -cpdef Column to_lower(Column input, Stream stream=*, DeviceMemoryResource mr=*) -cpdef Column to_upper(Column input, Stream stream=*, DeviceMemoryResource mr=*) -cpdef Column swapcase(Column input, Stream stream=*, DeviceMemoryResource mr=*) +cpdef Column to_lower(Column input, object stream = *, DeviceMemoryResource mr=*) +cpdef Column to_upper(Column input, object stream = *, DeviceMemoryResource mr=*) +cpdef Column swapcase(Column input, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/strings/case.pyi b/python/pylibcudf/pylibcudf/strings/case.pyi index ecdb614fcd7..1337e7df5a9 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pyi +++ b/python/pylibcudf/pylibcudf/strings/case.pyi @@ -1,23 +1,23 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def to_lower( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def to_upper( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def swapcase( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx index 5e7d20f01f8..ec6539f42e1 100644 --- a/python/pylibcudf/pylibcudf/strings/case.pyx +++ b/python/pylibcudf/pylibcudf/strings/case.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -9,10 +9,11 @@ from pylibcudf.libcudf.strings cimport case as cpp_case from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["swapcase", "to_lower", "to_upper"] -cpdef Column to_lower(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column to_lower(Column input, object stream=None, DeviceMemoryResource mr=None): """Returns a column of lowercased strings. For details, see :cpp:func:`to_lower`. @@ -32,14 +33,15 @@ cpdef Column to_lower(Column input, Stream stream=None, DeviceMemoryResource mr= Column of strings lowercased from the input column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_case.to_lower(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_case.to_lower(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef Column to_upper(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column to_upper(Column input, object stream=None, DeviceMemoryResource mr=None): """Returns a column of uppercased strings. For details, see :cpp:func:`to_upper`. 
@@ -59,14 +61,15 @@ cpdef Column to_upper(Column input, Stream stream=None, DeviceMemoryResource mr= Column of strings uppercased from the input column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_case.to_upper(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_case.to_upper(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef Column swapcase(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column swapcase(Column input, object stream=None, DeviceMemoryResource mr=None): """Returns a column of strings where the lowercase characters are converted to uppercase and the uppercase characters are converted to lowercase. @@ -88,9 +91,10 @@ cpdef Column swapcase(Column input, Stream stream=None, DeviceMemoryResource mr= Column of strings """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_case.swapcase(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_case.swapcase(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd index 009886f3e9f..59c045dba15 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pxd +++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd @@ -1,18 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.strings.char_types cimport string_character_types from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column all_characters_of_type( Column source_strings, string_character_types types, string_character_types verify_types, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) @@ -21,6 +20,6 @@ cpdef Column filter_characters_of_type( string_character_types types_to_remove, Scalar replacement, string_character_types types_to_keep, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyi b/python/pylibcudf/pylibcudf/strings/char_types.pyi index 12749d79f6d..1740a67eb00 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyi +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyi @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike class StringCharacterTypes(IntEnum): DECIMAL = ... @@ -25,7 +25,7 @@ def all_characters_of_type( source_strings: Column, types: StringCharacterTypes, verify_types: StringCharacterTypes, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def filter_characters_of_type( @@ -33,6 +33,6 @@ def filter_characters_of_type( types_to_remove: StringCharacterTypes, replacement: Scalar, types_to_keep: StringCharacterTypes, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx index 5cb5025798e..2567ab8ee4b 100644 --- a/python/pylibcudf/pylibcudf/strings/char_types.pyx +++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -14,6 +14,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference +from cuda.bindings.cyruntime cimport cudaStream_t from pylibcudf.libcudf.strings.char_types import \ string_character_types as StringCharacterTypes # no-cython-lint @@ -27,7 +28,7 @@ cpdef Column all_characters_of_type( Column source_strings, string_character_types types, string_character_types verify_types, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -50,7 +51,8 @@ cpdef Column all_characters_of_type( New column of boolean results for each string """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -58,18 +60,18 @@ cpdef Column all_characters_of_type( source_strings.view(), types, verify_types, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column filter_characters_of_type( Column source_strings, 
string_character_types types_to_remove, Scalar replacement, string_character_types types_to_keep, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -99,7 +101,8 @@ cpdef Column filter_characters_of_type( replacement.c_obj.get() ) cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -108,10 +111,10 @@ cpdef Column filter_characters_of_type( types_to_remove, dereference(c_replacement), types_to_keep, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) StringCharacterTypes.__str__ = StringCharacterTypes.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/strings/combine.pxd index b889169c7c7..32a58abdc23 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pxd +++ b/python/pylibcudf/pylibcudf/strings/combine.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column @@ -9,7 +9,6 @@ from pylibcudf.libcudf.strings.combine cimport ( from pylibcudf.scalar cimport Scalar from pylibcudf.table cimport Table from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrScalar: Column @@ -21,7 +20,7 @@ cpdef Column concatenate( Scalar narep=*, Scalar col_narep=*, separator_on_nulls separate_nulls=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -29,7 +28,7 @@ cpdef Column join_strings( Column input, Scalar separator, Scalar narep, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -40,6 +39,6 @@ cpdef Column join_list_elements( Scalar string_narep, separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyi b/python/pylibcudf/pylibcudf/strings/combine.pyi index fa568046fa8..3186709996f 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyi +++ b/python/pylibcudf/pylibcudf/strings/combine.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike class SeparatorOnNulls(IntEnum): YES = ... @@ -24,14 +24,14 @@ def concatenate( narep: Scalar | None = None, col_narep: Scalar | None = None, separate_nulls: SeparatorOnNulls = SeparatorOnNulls.YES, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def join_strings( input: Column, separator: Scalar, narep: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def join_list_elements( @@ -41,6 +41,6 @@ def join_list_elements( string_narep: Scalar, separate_nulls: SeparatorOnNulls, empty_list_policy: OutputIfEmptyList, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/combine.pyx b/python/pylibcudf/pylibcudf/strings/combine.pyx index e570a18c585..82903002907 100644 --- a/python/pylibcudf/pylibcudf/strings/combine.pyx +++ b/python/pylibcudf/pylibcudf/strings/combine.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,6 +16,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference +from cuda.bindings.cyruntime cimport cudaStream_t from pylibcudf.libcudf.strings.combine import \ output_if_empty_list as OutputIfEmptyList # no-cython-lint from pylibcudf.libcudf.strings.combine import \ @@ -35,7 +36,7 @@ cpdef Column concatenate( Scalar narep=None, Scalar col_narep=None, separator_on_nulls separate_nulls=separator_on_nulls.YES, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -68,12 +69,13 @@ cpdef Column concatenate( cdef unique_ptr[column] c_result cdef const string_scalar* c_col_narep cdef const string_scalar* c_separator - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if narep is None: narep = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), 
stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) cdef const string_scalar* c_narep = ( narep.c_obj.get() @@ -82,7 +84,7 @@ cpdef Column concatenate( if ColumnOrScalar is Column: if col_narep is None: col_narep = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) c_col_narep = ( col_narep.c_obj.get() @@ -95,7 +97,7 @@ cpdef Column concatenate( dereference(c_narep), dereference(c_col_narep), separate_nulls, - stream.view(), + _cs, mr.get_mr() ) ) @@ -112,20 +114,20 @@ cpdef Column concatenate( dereference(c_separator), dereference(c_narep), separate_nulls, - stream.view(), + _cs, mr.get_mr() ) ) else: raise ValueError("separator must be a Column or a Scalar") - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column join_strings( Column input, Scalar separator, Scalar narep, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -149,7 +151,8 @@ cpdef Column join_strings( New column containing one string """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) cdef const string_scalar* c_separator = ( separator.c_obj.get() @@ -163,12 +166,12 @@ cpdef Column join_strings( input.view(), dereference(c_separator), dereference(c_narep), - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column join_list_elements( @@ -178,7 +181,7 @@ cpdef Column join_list_elements( Scalar string_narep, separator_on_nulls separate_nulls, output_if_empty_list empty_list_policy, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -217,7 +220,8 @@ cpdef 
Column join_list_elements( New strings column with concatenated results """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) cdef const string_scalar* c_separator_narep = ( separator_narep.c_obj.get() @@ -237,7 +241,7 @@ cpdef Column join_list_elements( dereference(c_string_narep), separate_nulls, empty_list_policy, - stream.view(), + _cs, mr.get_mr() ) ) @@ -251,13 +255,13 @@ cpdef Column join_list_elements( dereference(c_separator_narep), separate_nulls, empty_list_policy, - stream.view(), + _cs, mr.get_mr() ) ) else: raise ValueError("separator must be a Column or a Scalar") - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) OutputIfEmptyList.__str__ = OutputIfEmptyList.__repr__ SeparatorOnNulls.__str__ = SeparatorOnNulls.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd index b3b0f06efb5..585f2fac1ff 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pxd +++ b/python/pylibcudf/pylibcudf/strings/contains.pxd @@ -1,28 +1,27 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column contains_re( - Column input, RegexProgram prog, Stream stream=*, DeviceMemoryResource mr=* + Column input, RegexProgram prog, object stream = *, DeviceMemoryResource mr=* ) cpdef Column count_re( - Column input, RegexProgram prog, Stream stream=*, DeviceMemoryResource mr=* + Column input, RegexProgram prog, object stream = *, DeviceMemoryResource mr=* ) cpdef Column matches_re( - Column input, RegexProgram prog, Stream stream=*, DeviceMemoryResource mr=* + Column input, RegexProgram prog, object stream = *, DeviceMemoryResource mr=* ) cpdef Column like( Column input, str pattern, str escape_character=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyi b/python/pylibcudf/pylibcudf/strings/contains.pyi index 3685cf5345a..b751ef0b24c 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyi +++ b/python/pylibcudf/pylibcudf/strings/contains.pyi @@ -1,34 +1,34 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.utils import CudaStreamLike def contains_re( input: Column, prog: RegexProgram, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def count_re( input: Column, prog: RegexProgram, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def matches_re( input: Column, prog: RegexProgram, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def like( input: Column, pattern: str, escape_character: str | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx index 8fe74228854..495d1637d8a 100644 --- a/python/pylibcudf/pylibcudf/strings/contains.pyx +++ b/python/pylibcudf/pylibcudf/strings/contains.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -11,13 +11,14 @@ from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["contains_re", "count_re", "like", "matches_re"] cpdef Column contains_re( Column input, RegexProgram prog, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Returns a boolean column identifying rows which match the given @@ -39,24 +40,27 @@ cpdef Column contains_re( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() + if _stream is None: + _stream = _get_stream(None) mr = _get_memory_resource(mr) with nogil: result = cpp_contains.contains_re( input.view(), prog.c_obj.get()[0], - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column 
count_re( Column input, RegexProgram prog, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Returns the number of times the given regex_program's pattern @@ -78,24 +82,25 @@ cpdef Column count_re( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: result = cpp_contains.count_re( input.view(), prog.c_obj.get()[0], - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column matches_re( Column input, RegexProgram prog, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Returns a boolean column identifying rows which @@ -118,25 +123,26 @@ cpdef Column matches_re( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: result = cpp_contains.matches_re( input.view(), prog.c_obj.get()[0], - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column like( Column input, str pattern, str escape_character=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -161,7 +167,8 @@ cpdef Column like( New column of boolean results for each string """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if escape_character is None: @@ -175,9 +182,9 @@ cpdef Column like( input.view(), c_pattern, c_escape_character, - stream.view(), + _cs, mr.get_mr() ) - stream.synchronize() + _stream.synchronize() - return Column.from_libcudf(move(result), stream, mr) + return 
Column.from_libcudf(move(result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd index cc1206cf29b..0929544287f 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pxd @@ -1,20 +1,19 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column to_booleans( - Column input, Scalar true_string, Stream stream=*, DeviceMemoryResource mr=* + Column input, Scalar true_string, object stream = *, DeviceMemoryResource mr=* ) cpdef Column from_booleans( Column booleans, Scalar true_string, Scalar false_string, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi index 608b47bad8c..10c7b96bfc0 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyi @@ -1,22 +1,22 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def to_booleans( input: Column, true_string: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def from_booleans( booleans: Column, true_string: Scalar, false_string: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx index 6f7965f8a3b..e8f963cf0f3 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_booleans.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -15,11 +15,12 @@ from pylibcudf.utils cimport _get_stream, _get_memory_resource from cython.operator import dereference from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["from_booleans", "to_booleans"] cpdef Column to_booleans( - Column input, Scalar true_string, Stream stream=None, DeviceMemoryResource mr=None + Column input, Scalar true_string, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new bool column by parsing boolean values from the strings @@ -47,24 +48,25 @@ cpdef Column to_booleans( cdef const string_scalar* c_true_string = ( true_string.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_booleans.to_booleans( input.view(), dereference(c_true_string), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column from_booleans( Column booleans, Scalar true_string, Scalar false_string, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -99,7 +101,8 @@ cpdef Column from_booleans( cdef const string_scalar* c_false_string = ( false_string.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -107,8 +110,8 @@ cpdef Column from_booleans( booleans.view(), dereference(c_true_string), dereference(c_false_string), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd 
b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd index 407eb06ce6a..d0a5d2fc829 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pxd @@ -1,18 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.string cimport string from pylibcudf.column cimport Column from pylibcudf.types cimport DataType from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column to_timestamps( Column input, DataType timestamp_type, str format, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) @@ -20,13 +19,13 @@ cpdef Column from_timestamps( Column timestamps, str format, Column input_strings_names, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Column is_timestamp( Column input, str format, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi index 5fdc863705d..99f067ecb04 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyi @@ -1,29 +1,29 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike def to_timestamps( input: Column, timestamp_type: DataType, format: str, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def from_timestamps( timestamps: Column, format: str, input_strings_names: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_timestamp( input: Column, format: str, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx index 07b35de7c54..633445a7383 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_datetime.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -14,6 +14,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from pylibcudf.types import DataType +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["from_timestamps", "is_timestamp", "to_timestamps"] @@ -21,7 +22,7 @@ cpdef Column to_timestamps( Column input, DataType timestamp_type, str format, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -51,24 +52,25 @@ cpdef Column to_timestamps( """ cdef unique_ptr[column] c_result cdef string c_format = format.encode() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_datetime.to_timestamps( input.view(), timestamp_type.c_obj, c_format, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column from_timestamps( Column timestamps, str format, Column input_strings_names, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -98,23 +100,24 @@ cpdef Column from_timestamps( """ cdef unique_ptr[column] c_result cdef string c_format = format.encode() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_datetime.from_timestamps( timestamps.view(), c_format, input_strings_names.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column is_timestamp( Column input, str format, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -141,14 +144,15 @@ cpdef Column is_timestamp( """ cdef unique_ptr[column] 
c_result cdef string c_format = format.encode() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_datetime.is_timestamp( input.view(), c_format, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd index 62b372d0af4..a912d939a83 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pxd @@ -1,24 +1,23 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.string cimport string from pylibcudf.column cimport Column from pylibcudf.types cimport DataType from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column to_durations( Column input, DataType duration_type, str format, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Column from_durations( Column durations, str format=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi index 95ba392ec94..ac9fd9825dc 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyi @@ -1,22 +1,22 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike def to_durations( input: Column, duration_type: DataType, format: str, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def from_durations( durations: Column, format: str | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx index 9bf8eb96009..548df7398b4 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_durations.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -14,6 +14,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from pylibcudf.types import DataType +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["from_durations", "to_durations"] @@ -21,7 +22,7 @@ cpdef Column to_durations( Column input, DataType duration_type, str format, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """ @@ -51,7 +52,8 @@ cpdef Column to_durations( """ cdef unique_ptr[column] c_result cdef string c_format = format.encode() - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -59,16 +61,16 @@ cpdef Column to_durations( input.view(), duration_type.c_obj, c_format, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column from_durations( Column durations, str format=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ): """ @@ -95,7 +97,8 @@ cpdef Column from_durations( New strings column with formatted durations. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if format is None: @@ -106,8 +109,8 @@ cpdef Column from_durations( c_result = cpp_convert_durations.from_durations( durations.view(), c_format, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd index 046556db181..439f8884008 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pxd @@ -1,26 +1,25 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.types cimport DataType from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column to_fixed_point( Column input, DataType output_type, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) cpdef Column from_fixed_point( - Column input, Stream stream=*, DeviceMemoryResource mr=* + Column input, object stream = *, DeviceMemoryResource mr=* ) cpdef Column is_fixed_point( Column input, DataType decimal_type=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi index 7269f970069..a9d4a0eac98 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyi @@ -1,26 +1,26 @@ -# 
SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike def to_fixed_point( input: Column, output_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def from_fixed_point( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_fixed_point( input: Column, decimal_type: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx index 13020a5ee73..059373790c5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_fixed_point.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -12,12 +12,13 @@ from pylibcudf.types cimport DataType, type_id from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["from_fixed_point", "is_fixed_point", "to_fixed_point"] cpdef Column to_fixed_point( - Column input, DataType output_type, Stream stream=None, DeviceMemoryResource mr=None + Column input, DataType output_type, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new fixed-point column parsing decimal values from the @@ -42,21 +43,22 @@ cpdef Column to_fixed_point( New column of output_type. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_fixed_point.to_fixed_point( input.view(), output_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column from_fixed_point( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new strings column converting the fixed-point values @@ -78,20 +80,21 @@ cpdef Column from_fixed_point( New strings column. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_fixed_point.from_fixed_point( - input.view(), stream.view(), mr.get_mr() + input.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column is_fixed_point( Column input, DataType decimal_type=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -118,7 +121,8 @@ cpdef Column is_fixed_point( New column of boolean results for each string. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if decimal_type is None: @@ -128,8 +132,8 @@ cpdef Column is_fixed_point( c_result = cpp_fixed_point.is_fixed_point( input.view(), decimal_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd index a2b98fa0b74..0d394fa1fe7 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pxd @@ -1,16 +1,15 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.types cimport DataType from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column to_floats( - Column strings, DataType output_type, Stream stream=*, DeviceMemoryResource mr=* + Column strings, DataType output_type, object stream = *, DeviceMemoryResource mr=* ) -cpdef Column from_floats(Column floats, Stream stream=*, DeviceMemoryResource mr=*) +cpdef Column from_floats(Column floats, object stream = *, DeviceMemoryResource mr=*) -cpdef Column is_float(Column input, Stream stream=*, DeviceMemoryResource mr=*) +cpdef Column is_float(Column input, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi index b5c8d7e7497..b334dfef9c7 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyi @@ -1,25 +1,25 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike def to_floats( strings: Column, output_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def from_floats( floats: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_float( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx index 59ac17a3e1c..d4901ce7be6 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_floats.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -12,13 +12,14 @@ from pylibcudf.types cimport DataType from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["from_floats", "is_float", "to_floats"] cpdef Column to_floats( Column strings, DataType output_type, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -44,22 +45,23 @@ cpdef Column to_floats( New column with floats converted from strings. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_floats.to_floats( strings.view(), output_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column from_floats( - Column floats, Stream stream=None, DeviceMemoryResource mr=None + Column floats, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new strings column converting the float values from the @@ -81,18 +83,19 @@ cpdef Column from_floats( New strings column with floats as strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_floats.from_floats( - floats.view(), stream.view(), mr.get_mr() + floats.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef Column is_float(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_float(Column input, object stream=None, DeviceMemoryResource mr=None): """ Returns a boolean column identifying strings in which all characters are valid for conversion to floats. @@ -113,10 +116,13 @@ cpdef Column is_float(Column input, Stream stream=None, DeviceMemoryResource mr= New column of boolean results for each string. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_convert_floats.is_float(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_convert_floats.is_float( + input.view(), _cs, mr.get_mr() + ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd index 376081e9b20..059e8c31f19 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pxd @@ -1,32 +1,31 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.types cimport DataType from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column to_integers( - Column input, DataType output_type, Stream stream=*, DeviceMemoryResource mr=* + Column input, DataType output_type, object stream = *, DeviceMemoryResource mr=* ) cpdef Column from_integers( - Column integers, Stream stream=*, DeviceMemoryResource mr=* + Column integers, object stream = *, DeviceMemoryResource mr=* ) cpdef Column is_integer( - Column input, DataType int_type=*, Stream stream=*, DeviceMemoryResource mr=* + Column input, DataType int_type=*, object stream = *, DeviceMemoryResource mr=* ) cpdef Column hex_to_integers( - Column input, DataType output_type, Stream stream=*, DeviceMemoryResource mr=* + Column input, DataType output_type, object stream = *, DeviceMemoryResource mr=* ) cpdef Column is_hex( - Column input, Stream stream=*, DeviceMemoryResource mr=* + Column input, object stream = *, DeviceMemoryResource mr=* ) cpdef Column integers_to_hex( - Column input, Stream stream=*, DeviceMemoryResource mr=* + Column input, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi index 4625ee5e883..88a66350466 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyi @@ -1,42 +1,42 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike def to_integers( input: Column, output_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def from_integers( integers: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_integer( input: Column, int_type: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def hex_to_integers( input: Column, output_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_hex( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def integers_to_hex( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx index c5945e5e1e5..b717ddbbcda 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_integers.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -12,6 +12,7 @@ from pylibcudf.types cimport DataType from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "from_integers", @@ -23,7 +24,7 @@ __all__ = [ ] cpdef Column to_integers( - Column input, DataType output_type, Stream stream=None, DeviceMemoryResource mr=None + Column input, DataType output_type, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new integer numeric column parsing integer values from the @@ -48,7 +49,8 @@ cpdef Column to_integers( New column with integers converted from strings. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -56,16 +58,16 @@ cpdef Column to_integers( cpp_convert_integers.to_integers( input.view(), output_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column from_integers( - Column integers, Stream stream=None, DeviceMemoryResource mr=None + Column integers, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new strings column converting the integer values from the @@ -87,25 +89,26 @@ cpdef Column from_integers( New strings column with integers as strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = move( cpp_convert_integers.from_integers( integers.view(), - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column is_integer( Column input, DataType int_type=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -133,7 +136,8 @@ cpdef Column is_integer( New column of boolean results for each string. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if int_type is None: @@ -141,7 +145,7 @@ cpdef Column is_integer( c_result = move( cpp_convert_integers.is_integer( input.view(), - stream.view(), + _cs, mr.get_mr() ) ) @@ -151,16 +155,16 @@ cpdef Column is_integer( cpp_convert_integers.is_integer( input.view(), int_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column hex_to_integers( - Column input, DataType output_type, Stream stream=None, DeviceMemoryResource mr=None + Column input, DataType output_type, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new integer numeric column parsing hexadecimal values @@ -185,7 +189,8 @@ cpdef Column hex_to_integers( New column with integers converted from strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -193,15 +198,15 @@ cpdef Column hex_to_integers( cpp_convert_integers.hex_to_integers( input.view(), output_type.c_obj, - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef Column is_hex(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_hex(Column input, object stream=None, DeviceMemoryResource mr=None): """ Returns a boolean column identifying strings in which all characters are valid for conversion to integers from hex. @@ -222,23 +227,24 @@ cpdef Column is_hex(Column input, Stream stream=None, DeviceMemoryResource mr=No New column of boolean results for each string. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = move( cpp_convert_integers.is_hex( input.view(), - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column integers_to_hex( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Returns a new strings column converting integer columns to hexadecimal @@ -260,16 +266,17 @@ cpdef Column integers_to_hex( New strings column with hexadecimal characters. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = move( cpp_convert_integers.integers_to_hex( input.view(), - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd index 53a3927af41..04df2862c31 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pxd @@ -1,19 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column ipv4_to_integers( - Column input, Stream stream=*, DeviceMemoryResource mr=* + Column input, object stream = *, DeviceMemoryResource mr=* ) cpdef Column integers_to_ipv4( - Column integers, Stream stream=*, DeviceMemoryResource mr=* + Column integers, object stream = *, DeviceMemoryResource mr=* ) cpdef Column is_ipv4( - Column input, Stream stream=*, DeviceMemoryResource mr=* + Column input, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi index 86a969a4021..16e4d8d990a 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyi @@ -1,23 +1,23 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def ipv4_to_integers( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def integers_to_ipv4( integers: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_ipv4( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx index 72021e85a9d..45b98190aa7 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_ipv4.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -9,11 +9,12 @@ from pylibcudf.libcudf.strings.convert cimport convert_ipv4 as cpp_convert_ipv4 from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["integers_to_ipv4", "ipv4_to_integers", "is_ipv4"] cpdef Column ipv4_to_integers( - Column input, Stream stream=None, DeviceMemoryResource mr=None + Column input, object stream=None, DeviceMemoryResource mr=None ): """ Converts IPv4 addresses into integers. @@ -34,19 +35,20 @@ cpdef Column ipv4_to_integers( New uint32 column converted from strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_ipv4.ipv4_to_integers( - input.view(), stream.view(), mr.get_mr() + input.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column integers_to_ipv4( - Column integers, Stream stream=None, DeviceMemoryResource mr=None + Column integers, object stream=None, DeviceMemoryResource mr=None ): """ Converts integers into IPv4 addresses as strings. @@ -67,18 +69,19 @@ cpdef Column integers_to_ipv4( New strings column. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_convert_ipv4.integers_to_ipv4( - integers.view(), stream.view(), mr.get_mr() + integers.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef Column is_ipv4(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_ipv4(Column input, object stream=None, DeviceMemoryResource mr=None): """ Returns a boolean column identifying strings in which all characters are valid for conversion to integers from IPv4 format. @@ -99,10 +102,11 @@ cpdef Column is_ipv4(Column input, Stream stream=None, DeviceMemoryResource mr=N New column of boolean results for each string. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_convert_ipv4.is_ipv4(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_convert_ipv4.is_ipv4(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd index a2dcc15dacd..c25cf9d7146 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pxd @@ -1,16 +1,15 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column format_list_column( Column input, Scalar na_rep=*, Column separators=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi index cf301dd9a1b..29f94a30123 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyi @@ -1,16 +1,16 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def format_list_column( input: Column, na_rep: Scalar | None = None, separators: Column | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx index 79648efcc3f..9c8f9d7b02e 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_lists.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -20,6 +20,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["format_list_column"] @@ -27,7 +28,7 @@ cpdef Column format_list_column( Column input, Scalar na_rep=None, Column separators=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -58,12 +59,13 @@ cpdef Column format_list_column( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if na_rep is None: na_rep = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) cdef const string_scalar* c_na_rep = ( @@ -78,8 +80,8 @@ cpdef Column format_list_column( 
input.view(), dereference(c_na_rep), separators.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd index dce44f5e547..56b1f803d38 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pxd @@ -1,15 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column url_encode( - Column Input, Stream stream=*, DeviceMemoryResource mr=* + Column Input, object stream = *, DeviceMemoryResource mr=* ) cpdef Column url_decode( - Column Input, Stream stream=*, DeviceMemoryResource mr=* + Column Input, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi index 6a248cdc974..8707da953b5 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyi @@ -1,18 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def url_encode( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def url_decode( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx index 30ca51f27f7..efe009e6c02 100644 --- a/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx +++ b/python/pylibcudf/pylibcudf/strings/convert/convert_urls.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -10,10 +10,11 @@ from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["url_decode", "url_encode"] -cpdef Column url_encode(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column url_encode(Column input, object stream=None, DeviceMemoryResource mr=None): """ Encodes each string using URL encoding. @@ -33,16 +34,19 @@ cpdef Column url_encode(Column input, Stream stream=None, DeviceMemoryResource m New strings column. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_convert_urls.url_encode(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_convert_urls.url_encode( + input.view(), _cs, mr.get_mr() + ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) -cpdef Column url_decode(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column url_decode(Column input, object stream=None, DeviceMemoryResource mr=None): """ Decodes each string using URL encoding. @@ -62,10 +66,13 @@ cpdef Column url_decode(Column input, Stream stream=None, DeviceMemoryResource m New strings column. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_convert_urls.url_decode(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_convert_urls.url_decode( + input.view(), _cs, mr.get_mr() + ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/strings/extract.pxd index c8fcb900d2b..85f722970c8 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pxd +++ b/python/pylibcudf/pylibcudf/strings/extract.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column @@ -6,21 +6,20 @@ from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.table cimport Table from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Table extract( - Column input, RegexProgram prog, Stream stream=*, DeviceMemoryResource mr=* + Column input, RegexProgram prog, object stream = *, DeviceMemoryResource mr=* ) cpdef Column extract_all_record( - Column input, RegexProgram prog, Stream stream=*, DeviceMemoryResource mr=* + Column input, RegexProgram prog, object stream = *, DeviceMemoryResource mr=* ) cpdef Column extract_single( Column input, RegexProgram prog, size_type group, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyi b/python/pylibcudf/pylibcudf/strings/extract.pyi index 853420a8091..a9607266bbc 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyi +++ b/python/pylibcudf/pylibcudf/strings/extract.pyi @@ -1,29 +1,29 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.strings.regex_program import RegexProgram from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def extract( input: Column, prog: RegexProgram, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def extract_all_record( input: Column, prog: RegexProgram, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def extract_single( input: Column, prog: RegexProgram, group: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/extract.pyx b/python/pylibcudf/pylibcudf/strings/extract.pyx index bac20c2cd15..c670b226e84 100644 --- a/python/pylibcudf/pylibcudf/strings/extract.pyx +++ b/python/pylibcudf/pylibcudf/strings/extract.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -13,11 +13,12 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["extract", "extract_all_record", "extract_single"] cpdef Table extract( - Column input, RegexProgram prog, Stream stream=None, DeviceMemoryResource mr=None + Column input, RegexProgram prog, object stream=None, DeviceMemoryResource mr=None ): """ Returns a table of strings columns where each column @@ -41,22 +42,23 @@ cpdef Table extract( Columns of strings extracted from the input column. 
""" cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_extract.extract( input.view(), prog.c_obj.get()[0], - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Column extract_all_record( - Column input, RegexProgram prog, Stream stream=None, DeviceMemoryResource mr=None + Column input, RegexProgram prog, object stream=None, DeviceMemoryResource mr=None ): """ Returns a lists column of strings where each string column @@ -80,25 +82,26 @@ cpdef Column extract_all_record( Lists column containing strings extracted from the input column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_extract.extract_all_record( input.view(), prog.c_obj.get()[0], - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column extract_single( Column input, RegexProgram prog, size_type group, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -124,7 +127,8 @@ cpdef Column extract_single( Column of strings extracted from the input column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -132,8 +136,8 @@ cpdef Column extract_single( input.view(), prog.c_obj.get()[0], group, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git 
a/python/pylibcudf/pylibcudf/strings/find.pxd b/python/pylibcudf/pylibcudf/strings/find.pxd index 3ec32563c5a..1a04cf4eca2 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pxd +++ b/python/pylibcudf/pylibcudf/strings/find.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrScalar: Column @@ -16,7 +15,7 @@ cpdef Column find( ColumnOrScalar target, size_type start=*, size_type stop=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) @@ -25,27 +24,27 @@ cpdef Column rfind( Scalar target, size_type start=*, size_type stop=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column contains( Column input, ColumnOrScalar target, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column starts_with( Column input, ColumnOrScalar target, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column ends_with( Column input, ColumnOrScalar target, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/find.pyi b/python/pylibcudf/pylibcudf/strings/find.pyi index a566fbdd72a..a8b3ca1da7c 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyi +++ b/python/pylibcudf/pylibcudf/strings/find.pyi @@ -1,18 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def find( input: Column, target: Column | Scalar, start: int = 0, stop: int = -1, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def rfind( @@ -20,24 +20,24 @@ def rfind( target: Scalar, start: int = 0, stop: int = -1, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def contains( input: Column, target: Column | Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def starts_with( input: Column, target: Column | Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def ends_with( input: Column, target: Column | Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx index 7323a924342..102a8787651 100644 --- a/python/pylibcudf/pylibcudf/strings/find.pyx +++ b/python/pylibcudf/pylibcudf/strings/find.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -14,6 +14,7 @@ from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference from pylibcudf.libcudf.scalar.scalar cimport string_scalar +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["contains", "ends_with", "find", "rfind", "starts_with"] @@ -22,7 +23,7 @@ cpdef Column find( ColumnOrScalar target, size_type start=0, size_type stop=-1, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Returns a column of character position values where the target string is @@ -58,7 +59,8 @@ cpdef Column find( New integer column with character position values """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if ColumnOrScalar is Column: with nogil: @@ -66,7 +68,7 @@ cpdef Column find( input.view(), target.view(), start, - stream.view(), + _cs, mr.get_mr() ) elif ColumnOrScalar is Scalar: @@ -76,13 +78,13 @@ cpdef Column find( dereference((target.c_obj.get())), start, stop, - stream.view(), + _cs, mr.get_mr() ) else: raise ValueError(f"Invalid target {target}") - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column rfind( @@ -90,7 +92,7 @@ cpdef Column rfind( Scalar target, size_type start=0, size_type stop=-1, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -119,7 +121,8 @@ cpdef Column rfind( New integer column with character position values """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: result = cpp_find.rfind( @@ -127,16 +130,16 @@ cpdef Column rfind( dereference((target.c_obj.get())), start, stop, - stream.view(), + 
_cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column contains( Column input, ColumnOrScalar target, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -169,14 +172,15 @@ cpdef Column contains( New boolean column with True for each string that contains the target """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if ColumnOrScalar is Column: with nogil: result = cpp_find.contains( input.view(), target.view(), - stream.view(), + _cs, mr.get_mr() ) elif ColumnOrScalar is Scalar: @@ -184,19 +188,19 @@ cpdef Column contains( result = cpp_find.contains( input.view(), dereference((target.c_obj.get())), - stream.view(), + _cs, mr.get_mr() ) else: raise ValueError(f"Invalid target {target}") - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column starts_with( Column input, ColumnOrScalar target, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -229,7 +233,8 @@ cpdef Column starts_with( New boolean column with True for each string that starts with the target """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if ColumnOrScalar is Column: @@ -237,7 +242,7 @@ cpdef Column starts_with( result = cpp_find.starts_with( input.view(), target.view(), - stream.view(), + _cs, mr.get_mr() ) elif ColumnOrScalar is Scalar: @@ -245,18 +250,18 @@ cpdef Column starts_with( result = cpp_find.starts_with( input.view(), dereference((target.c_obj.get())), - stream.view(), + _cs, mr.get_mr() ) else: raise ValueError(f"Invalid target {target}") - return Column.from_libcudf(move(result), stream, mr) + 
return Column.from_libcudf(move(result), _stream, mr) cpdef Column ends_with( Column input, ColumnOrScalar target, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -288,14 +293,15 @@ cpdef Column ends_with( New boolean column with True for each string that ends with the target """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if ColumnOrScalar is Column: with nogil: result = cpp_find.ends_with( input.view(), target.view(), - stream.view(), + _cs, mr.get_mr() ) elif ColumnOrScalar is Scalar: @@ -303,10 +309,10 @@ cpdef Column ends_with( result = cpp_find.ends_with( input.view(), dereference((target.c_obj.get())), - stream.view(), + _cs, mr.get_mr() ) else: raise ValueError(f"Invalid target {target}") - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd index f6677607c5e..e01cb33fdb8 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pxd +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pxd @@ -1,21 +1,20 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.table cimport Table from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column find_multiple( Column input, Column targets, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Table contains_multiple( Column input, Column targets, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi index 48de0eac0e1..76115cd7496 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyi +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyi @@ -1,21 +1,21 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def find_multiple( input: Column, targets: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def contains_multiple( input: Column, targets: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx index e18b178f803..ed5f0d78506 100644 --- a/python/pylibcudf/pylibcudf/strings/find_multiple.pyx +++ b/python/pylibcudf/pylibcudf/strings/find_multiple.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -11,13 +11,14 @@ from pylibcudf.table cimport Table from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["find_multiple", "contains_multiple"] cpdef Column find_multiple( Column input, Column targets, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -41,24 +42,25 @@ cpdef Column find_multiple( Lists column with character position values """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_find_multiple.find_multiple( input.view(), targets.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Table contains_multiple( Column input, Column targets, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -82,15 +84,16 @@ cpdef Table contains_multiple( Columns of booleans """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_find_multiple.contains_multiple( input.view(), targets.view(), - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/strings/findall.pxd index 2dc75fa6d34..ec7e01f7539 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pxd +++ b/python/pylibcudf/pylibcudf/strings/findall.pxd @@ -1,15 +1,14 @@ -# SPDX-FileCopyrightText: 
Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.strings.regex_program cimport RegexProgram from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column find_re( - Column input, RegexProgram pattern, Stream stream=*, DeviceMemoryResource mr=* + Column input, RegexProgram pattern, object stream = *, DeviceMemoryResource mr=* ) cpdef Column findall( - Column input, RegexProgram pattern, Stream stream=*, DeviceMemoryResource mr=* + Column input, RegexProgram pattern, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyi b/python/pylibcudf/pylibcudf/strings/findall.pyi index 5677a99d325..f72e786cf1d 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyi +++ b/python/pylibcudf/pylibcudf/strings/findall.pyi @@ -1,21 +1,21 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.utils import CudaStreamLike def find_re( input: Column, pattern: RegexProgram, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def findall( input: Column, pattern: RegexProgram, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 881664faced..5647a791ef1 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -10,11 +10,12 @@ from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["findall", "find_re"] cpdef Column findall( - Column input, RegexProgram pattern, Stream stream=None, DeviceMemoryResource mr=None + Column input, RegexProgram pattern, object stream=None, DeviceMemoryResource mr=None ): """ Returns a lists column of strings for each matching occurrence using @@ -37,22 +38,23 @@ cpdef Column findall( New lists column of strings """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_findall.findall( input.view(), pattern.c_obj.get()[0], - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column find_re( - Column input, RegexProgram pattern, Stream stream=None, DeviceMemoryResource mr=None + Column input, RegexProgram pattern, object stream=None, DeviceMemoryResource mr=None ): """ Returns character positions where the pattern first matches @@ -75,15 +77,16 @@ cpdef Column find_re( New column of integers """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream 
_stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_findall.find_re( input.view(), pattern.c_obj.get()[0], - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/strings/padding.pxd index 1dfbbd9950f..61dcaf7cba9 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pxd +++ b/python/pylibcudf/pylibcudf/strings/padding.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.string cimport string @@ -6,7 +6,6 @@ from pylibcudf.column cimport Column from pylibcudf.libcudf.strings.side_type cimport side_type from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column pad( @@ -14,14 +13,14 @@ cpdef Column pad( size_type width, side_type side, str fill_char, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column zfill( - Column input, size_type width, Stream stream=*, DeviceMemoryResource mr=* + Column input, size_type width, object stream = *, DeviceMemoryResource mr=* ) cpdef Column zfill_by_widths( - Column input, Column widths, Stream stream=*, DeviceMemoryResource mr=* + Column input, Column widths, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyi b/python/pylibcudf/pylibcudf/strings/padding.pyi index 26af5429acb..904b0022317 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyi +++ b/python/pylibcudf/pylibcudf/strings/padding.pyi @@ -1,29 +1,29 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. 
+# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.strings.side_type import SideType +from pylibcudf.utils import CudaStreamLike def pad( input: Column, width: int, side: SideType, fill_char: str, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def zfill( input: Column, width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def zfill_by_widths( input: Column, widths: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/padding.pyx b/python/pylibcudf/pylibcudf/strings/padding.pyx index 9409970b075..d8eb4f1da4a 100644 --- a/python/pylibcudf/pylibcudf/strings/padding.pyx +++ b/python/pylibcudf/pylibcudf/strings/padding.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -10,6 +10,7 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["pad", "zfill", "zfill_by_widths"] @@ -18,7 +19,7 @@ cpdef Column pad( size_type width, side_type side, str fill_char, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -46,7 +47,8 @@ cpdef Column pad( """ cdef unique_ptr[column] c_result cdef string c_fill_char = fill_char.encode("utf-8") - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -55,14 +57,14 @@ cpdef Column pad( width, side, c_fill_char, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column zfill( - Column input, size_type width, Stream stream=None, DeviceMemoryResource mr=None + Column input, size_type width, object stream=None, DeviceMemoryResource mr=None ): """ Add '0' as padding to the left of each string. @@ -84,21 +86,22 @@ cpdef Column zfill( New column of strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_padding.zfill( input.view(), width, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column zfill_by_widths( - Column input, Column widths, Stream stream=None, DeviceMemoryResource mr=None + Column input, Column widths, object stream=None, DeviceMemoryResource mr=None ): """ Add '0' as padding to the left of each string. @@ -120,15 +123,16 @@ cpdef Column zfill_by_widths( New column of strings. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_padding.zfill_by_widths( input.view(), widths.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/strings/repeat.pxd index f1abe23ce59..60725aa688e 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pxd +++ b/python/pylibcudf/pylibcudf/strings/repeat.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnorSizeType: Column @@ -13,6 +12,6 @@ ctypedef fused ColumnorSizeType: cpdef Column repeat_strings( Column input, ColumnorSizeType repeat_times, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyi b/python/pylibcudf/pylibcudf/strings/repeat.pyi index 5b47213e956..fedb7dee76c 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyi +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def repeat_strings( input: Column, repeat_times: Column | int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/repeat.pyx b/python/pylibcudf/pylibcudf/strings/repeat.pyx index 84a305bf866..7a9c5285d02 100644 --- a/python/pylibcudf/pylibcudf/strings/repeat.pyx +++ b/python/pylibcudf/pylibcudf/strings/repeat.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -11,13 +11,14 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from ..utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["repeat_strings"] cpdef Column repeat_strings( Column input, ColumnorSizeType repeat_times, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -44,7 +45,8 @@ cpdef Column repeat_strings( New column containing the repeated strings. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if ColumnorSizeType is Column: @@ -52,7 +54,7 @@ cpdef Column repeat_strings( c_result = cpp_repeat.repeat_strings( input.view(), repeat_times.view(), - stream.view(), + _cs, mr.get_mr() ) elif ColumnorSizeType is size_type: @@ -60,10 +62,10 @@ cpdef Column repeat_strings( c_result = cpp_repeat.repeat_strings( input.view(), repeat_times, - stream.view(), + _cs, mr.get_mr() ) else: raise ValueError("repeat_times must be size_type or integer") - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/replace.pxd b/python/pylibcudf/pylibcudf/strings/replace.pxd index a486869aada..aea2296b5f9 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pxd +++ b/python/pylibcudf/pylibcudf/strings/replace.pxd @@ -1,11 +1,10 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column replace( @@ -13,7 +12,7 @@ cpdef Column replace( Scalar target, Scalar repl, size_type maxrepl=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column replace_multiple( @@ -21,7 +20,7 @@ cpdef Column replace_multiple( Column target, Column repl, size_type maxrepl=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) cpdef Column replace_slice( @@ -29,6 +28,6 @@ cpdef Column replace_slice( Scalar repl=*, size_type start=*, size_type stop=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyi b/python/pylibcudf/pylibcudf/strings/replace.pyi index 3e62a76d2bf..0e76eb402f7 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyi +++ b/python/pylibcudf/pylibcudf/strings/replace.pyi @@ -1,18 +1,18 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def replace( input: Column, target: Scalar, repl: Scalar, maxrepl: int = -1, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def replace_multiple( @@ -20,7 +20,7 @@ def replace_multiple( target: Column, repl: Column, maxrepl: int = -1, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def replace_slice( @@ -28,6 +28,6 @@ def replace_slice( repl: Scalar | None = None, start: int = 0, stop: int = -1, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx index e1d88fed464..ccd6c924441 100644 --- a/python/pylibcudf/pylibcudf/strings/replace.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -19,6 +19,7 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["replace", "replace_multiple", "replace_slice"] @@ -27,7 +28,7 @@ cpdef Column replace( Scalar target, Scalar repl, size_type maxrepl=-1, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Replaces target string within each string with the specified replacement string. 
@@ -60,7 +61,8 @@ cpdef Column replace( target_str = (target.c_obj.get()) repl_str = (repl.c_obj.get()) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -69,11 +71,11 @@ cpdef Column replace( target_str[0], repl_str[0], maxrepl, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column replace_multiple( @@ -81,7 +83,7 @@ cpdef Column replace_multiple( Column target, Column repl, size_type maxrepl=-1, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Replaces target string within each string with the specified replacement string. @@ -109,7 +111,8 @@ cpdef Column replace_multiple( New string column with target replaced. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -117,11 +120,11 @@ cpdef Column replace_multiple( input.view(), target.view(), repl.view(), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column replace_slice( @@ -131,7 +134,7 @@ cpdef Column replace_slice( Scalar repl=None, size_type start=0, size_type stop=-1, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Replaces each string in the column with the provided repl string @@ -162,12 +165,13 @@ cpdef Column replace_slice( New string column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if repl is None: repl = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + 
cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) cdef const string_scalar* scalar_str = (repl.c_obj.get()) @@ -178,8 +182,8 @@ cpdef Column replace_slice( scalar_str[0], start, stop, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/strings/replace_re.pxd index fc833a61045..0d360f8de6f 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pxd +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pxd @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column @@ -7,7 +7,6 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_flags cimport regex_flags from pylibcudf.strings.regex_program cimport RegexProgram from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused Replacement: Column @@ -24,7 +23,7 @@ cpdef Column replace_re( Replacement replacement=*, size_type max_replace_count=*, regex_flags flags=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) @@ -32,6 +31,6 @@ cpdef Column replace_with_backrefs( Column input, RegexProgram prog, str replacement, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyi b/python/pylibcudf/pylibcudf/strings/replace_re.pyi index 29f8ddfe925..64970928323 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pyi +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyi @@ -1,15 +1,15 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from typing import overload from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.strings.regex_flags import RegexFlags from pylibcudf.strings.regex_program import RegexProgram +from pylibcudf.utils import CudaStreamLike @overload def replace_re( @@ -17,7 +17,7 @@ def replace_re( pattern: RegexProgram, replacement: Scalar, max_replace_count: int = -1, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... @overload @@ -27,13 +27,13 @@ def replace_re( replacement: Column, max_replace_count: int = -1, flags: RegexFlags = RegexFlags.DEFAULT, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def replace_with_backrefs( input: Column, prog: RegexProgram, replacement: str, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/replace_re.pyx b/python/pylibcudf/pylibcudf/strings/replace_re.pyx index 1819dd0ba2b..60e9c4c1666 100644 --- a/python/pylibcudf/pylibcudf/strings/replace_re.pyx +++ b/python/pylibcudf/pylibcudf/strings/replace_re.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference from libcpp.memory cimport unique_ptr @@ -19,6 +19,7 @@ from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["replace_re", "replace_with_backrefs"] @@ -28,7 +29,7 @@ cpdef Column replace_re( Replacement replacement=None, size_type max_replace_count=-1, regex_flags flags=regex_flags.DEFAULT, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -64,13 +65,14 @@ cpdef Column replace_re( """ cdef unique_ptr[column] c_result cdef vector[string] c_patterns - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if Patterns is RegexProgram and Replacement is Scalar: if replacement is None: replacement = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) with nogil: c_result = move( @@ -79,12 +81,12 @@ cpdef Column replace_re( patterns.c_obj.get()[0], dereference((replacement.get())), max_replace_count, - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) elif Patterns is list and Replacement is Column: c_patterns.reserve(len(patterns)) for pattern in patterns: @@ -97,12 +99,12 @@ cpdef Column replace_re( c_patterns, replacement.view(), flags, - stream.view(), + _cs, mr.get_mr() ) ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) else: raise TypeError("Must pass either a RegexProgram and a Scalar or a list") @@ -111,7 +113,7 @@ cpdef Column replace_with_backrefs( 
Column input, RegexProgram prog, str replacement, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -137,7 +139,8 @@ cpdef Column replace_with_backrefs( New strings column. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) cdef string c_replacement = replacement.encode() @@ -146,8 +149,8 @@ cpdef Column replace_with_backrefs( input.view(), prog.c_obj.get()[0], c_replacement, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/reverse.pyi b/python/pylibcudf/pylibcudf/strings/reverse.pyi index 182f4768825..48c602e2d28 100644 --- a/python/pylibcudf/pylibcudf/strings/reverse.pyi +++ b/python/pylibcudf/pylibcudf/strings/reverse.pyi @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def reverse( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/reverse.pyx b/python/pylibcudf/pylibcudf/strings/reverse.pyx index 49792b5661b..f1d06248523 100644 --- a/python/pylibcudf/pylibcudf/strings/reverse.pyx +++ b/python/pylibcudf/pylibcudf/strings/reverse.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -9,10 +9,11 @@ from pylibcudf.libcudf.strings cimport reverse as cpp_reverse from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["reverse"] -cpdef Column reverse(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column reverse(Column input, object stream=None, DeviceMemoryResource mr=None): """Reverses the characters within each string. Any null string entries return corresponding null output column entries. @@ -32,9 +33,10 @@ cpdef Column reverse(Column input, Stream stream=None, DeviceMemoryResource mr=N New strings column """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_reverse.reverse(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_reverse.reverse(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/slice.pxd b/python/pylibcudf/pylibcudf/strings/slice.pxd index 6bb5a8d3611..9612ead3108 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pxd +++ b/python/pylibcudf/pylibcudf/strings/slice.pxd @@ -1,10 +1,9 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream ctypedef fused ColumnOrScalar: Column @@ -15,6 +14,6 @@ cpdef Column slice_strings( ColumnOrScalar start=*, ColumnOrScalar stop=*, Scalar step=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyi b/python/pylibcudf/pylibcudf/strings/slice.pyi index 73ee8c31b5b..ac2e4d12f1f 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyi +++ b/python/pylibcudf/pylibcudf/strings/slice.pyi @@ -1,17 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike def slice_strings( input: Column, start: Column | Scalar | None = None, stop: Column | Scalar | None = None, step: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx index 2b5bbf2f621..b3ac2cd8bfe 100644 --- a/python/pylibcudf/pylibcudf/strings/slice.pyx +++ b/python/pylibcudf/pylibcudf/strings/slice.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -18,6 +18,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from ..utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["slice_strings"] @@ -26,7 +27,7 @@ cpdef Column slice_strings( ColumnOrScalar start=None, ColumnOrScalar stop=None, Scalar step=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Perform a slice operation on a strings column. @@ -60,7 +61,8 @@ cpdef Column slice_strings( cdef numeric_scalar[size_type]* cpp_start cdef numeric_scalar[size_type]* cpp_stop cdef numeric_scalar[size_type]* cpp_step - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if input is None: @@ -80,22 +82,22 @@ cpdef Column slice_strings( input.view(), start.view(), stop.view(), - stream.view(), + _cs, mr.get_mr() ) elif ColumnOrScalar is Scalar: if start is None: start = Scalar.from_libcudf( - cpp_make_fixed_width_scalar(0, stream.view(), mr.get_mr()) + cpp_make_fixed_width_scalar(0, _stream.view().value(), mr.get_mr()) ) if stop is None: stop = Scalar.from_libcudf( - cpp_make_fixed_width_scalar(0, stream.view(), mr.get_mr()) + cpp_make_fixed_width_scalar(0, _stream.view().value(), mr.get_mr()) ) if step is None: step = Scalar.from_libcudf( - cpp_make_fixed_width_scalar(1, stream.view(), mr.get_mr()) + cpp_make_fixed_width_scalar(1, _stream.view().value(), mr.get_mr()) ) cpp_start = start.c_obj.get() @@ -108,10 +110,10 @@ cpdef Column slice_strings( dereference(cpp_start), dereference(cpp_stop), dereference(cpp_step), - stream.view(), + _cs, mr.get_mr() ) else: raise ValueError("start, stop, and step must be either Column or Scalar") - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff 
--git a/python/pylibcudf/pylibcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/strings/split/partition.pxd index d8001682b32..e3da533c90c 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pxd +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pxd @@ -1,17 +1,16 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar from pylibcudf.table cimport Table from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Table partition( - Column input, Scalar delimiter=*, Stream stream=*, DeviceMemoryResource mr=* + Column input, Scalar delimiter=*, object stream = *, DeviceMemoryResource mr=* ) cpdef Table rpartition( - Column input, Scalar delimiter=*, Stream stream=*, DeviceMemoryResource mr=* + Column input, Scalar delimiter=*, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyi b/python/pylibcudf/pylibcudf/strings/split/partition.pyi index d919b68153c..cef2d16aea6 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyi +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyi @@ -1,22 +1,22 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def partition( input: Column, delimiter: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
def rpartition( input: Column, delimiter: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/strings/split/partition.pyx b/python/pylibcudf/pylibcudf/strings/split/partition.pyx index 728d7b9975d..ce813c10bba 100644 --- a/python/pylibcudf/pylibcudf/strings/split/partition.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/partition.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -16,13 +16,14 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["partition", "rpartition"] cpdef Table partition( Column input, Scalar delimiter=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -46,12 +47,13 @@ cpdef Table partition( """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if delimiter is None: delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) cdef const string_scalar* c_delimiter = ( @@ -62,16 +64,16 @@ cpdef Table partition( c_result = cpp_partition.partition( input.view(), dereference(c_delimiter), - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table rpartition( Column input, Scalar delimiter=None, - Stream stream=None, + object stream=None, 
DeviceMemoryResource mr=None, ): """ @@ -95,12 +97,13 @@ cpdef Table rpartition( """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if delimiter is None: delimiter = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) cdef const string_scalar* c_delimiter = ( @@ -111,8 +114,8 @@ cpdef Table rpartition( c_result = cpp_partition.rpartition( input.view(), dereference(c_delimiter), - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/strings/split/split.pxd index 06b77154b18..2372a177944 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pxd +++ b/python/pylibcudf/pylibcudf/strings/split/split.pxd @@ -7,50 +7,49 @@ from pylibcudf.scalar cimport Scalar from pylibcudf.strings.regex_program cimport RegexProgram from pylibcudf.table cimport Table from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Table split( - Column strings_column, Scalar delimiter, size_type maxsplit, Stream stream=*, + Column strings_column, Scalar delimiter, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Table rsplit( - Column strings_column, Scalar delimiter, size_type maxsplit, Stream stream=*, + Column strings_column, Scalar delimiter, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Column split_record( - Column strings, Scalar delimiter, size_type maxsplit, Stream stream=*, + Column strings, Scalar delimiter, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Column rsplit_record( - Column strings, Scalar delimiter, 
size_type maxsplit, Stream stream=*, + Column strings, Scalar delimiter, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Table split_re( - Column input, RegexProgram prog, size_type maxsplit, Stream stream=*, + Column input, RegexProgram prog, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Table rsplit_re( - Column input, RegexProgram prog, size_type maxsplit, Stream stream=*, + Column input, RegexProgram prog, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Column split_record_re( - Column input, RegexProgram prog, size_type maxsplit, Stream stream=*, + Column input, RegexProgram prog, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Column rsplit_record_re( - Column input, RegexProgram prog, size_type maxsplit, Stream stream=*, + Column input, RegexProgram prog, size_type maxsplit, object stream = *, DeviceMemoryResource mr=*, ) cpdef Column split_part( - Column input, Scalar delimiter, size_type index, Stream stream=*, + Column input, Scalar delimiter, size_type index, object stream = *, DeviceMemoryResource mr=*, ) diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyi b/python/pylibcudf/pylibcudf/strings/split/split.pyi index ae64e300b63..7a775bd960c 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyi +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyi @@ -2,73 +2,73 @@ # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.strings.regex_program import RegexProgram from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def split( strings_column: Column, delimiter: Scalar, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
def rsplit( strings_column: Column, delimiter: Scalar, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def split_record( strings: Column, delimiter: Scalar, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def rsplit_record( strings: Column, delimiter: Scalar, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def split_re( input: Column, prog: RegexProgram, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def rsplit_re( input: Column, prog: RegexProgram, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def split_record_re( input: Column, prog: RegexProgram, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def rsplit_record_re( input: Column, prog: RegexProgram, maxsplit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def split_part( input: Column, delimiter: Scalar, index: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/split/split.pyx b/python/pylibcudf/pylibcudf/strings/split/split.pyx index 0635df87e13..52803b08eb0 100644 --- a/python/pylibcudf/pylibcudf/strings/split/split.pyx +++ b/python/pylibcudf/pylibcudf/strings/split/split.pyx @@ -16,6 +16,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "rsplit", @@ -32,7 +33,7 @@ cpdef Table split( Column strings_column, Scalar delimiter, size_type maxsplit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -65,7 +66,8 @@ cpdef Table split( cdef const string_scalar* c_delimiter = ( delimiter.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -73,18 +75,18 @@ cpdef Table split( strings_column.view(), dereference(c_delimiter), maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table rsplit( Column strings_column, Scalar delimiter, size_type maxsplit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -117,7 +119,8 @@ cpdef Table rsplit( cdef const string_scalar* c_delimiter = ( delimiter.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -125,17 +128,17 @@ cpdef Table rsplit( strings_column.view(), dereference(c_delimiter), maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Column split_record( Column strings, Scalar delimiter, size_type maxsplit, - Stream stream=None, + object 
stream=None, DeviceMemoryResource mr=None, ): """ @@ -164,7 +167,8 @@ cpdef Column split_record( cdef const string_scalar* c_delimiter = ( delimiter.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -172,18 +176,18 @@ cpdef Column split_record( strings.view(), dereference(c_delimiter), maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column rsplit_record( Column strings, Scalar delimiter, size_type maxsplit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -213,7 +217,8 @@ cpdef Column rsplit_record( cdef const string_scalar* c_delimiter = ( delimiter.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -221,18 +226,18 @@ cpdef Column rsplit_record( strings.view(), dereference(c_delimiter), maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Table split_re( Column input, RegexProgram prog, size_type maxsplit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -259,7 +264,8 @@ cpdef Table split_re( A table of columns of strings. 
""" cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -267,17 +273,17 @@ cpdef Table split_re( input.view(), prog.c_obj.get()[0], maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Table rsplit_re( Column input, RegexProgram prog, size_type maxsplit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -305,7 +311,8 @@ cpdef Table rsplit_re( A table of columns of strings. """ cdef unique_ptr[table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -313,17 +320,17 @@ cpdef Table rsplit_re( input.view(), prog.c_obj.get()[0], maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Table.from_libcudf(move(c_result), stream, mr) + return Table.from_libcudf(move(c_result), _stream, mr) cpdef Column split_record_re( Column input, RegexProgram prog, size_type maxsplit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -350,7 +357,8 @@ cpdef Column split_record_re( Lists column of strings. 
""" cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -358,14 +366,14 @@ cpdef Column split_record_re( input.view(), prog.c_obj.get()[0], maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column rsplit_record_re( - Column input, RegexProgram prog, size_type maxsplit, Stream stream=None, + Column input, RegexProgram prog, size_type maxsplit, object stream=None, DeviceMemoryResource mr=None, ): """ @@ -392,7 +400,8 @@ cpdef Column rsplit_record_re( Lists column of strings. """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -400,22 +409,23 @@ cpdef Column rsplit_record_re( input.view(), prog.c_obj.get()[0], maxsplit, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column split_part( - Column input, Scalar delimiter, size_type index, Stream stream=None, + Column input, Scalar delimiter, size_type index, object stream=None, DeviceMemoryResource mr=None, ): cdef unique_ptr[column] c_result cdef const string_scalar* c_delimiter = ( delimiter.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -423,8 +433,8 @@ cpdef Column split_part( input.view(), dereference(c_delimiter), index, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/strip.pxd 
b/python/pylibcudf/pylibcudf/strings/strip.pxd index d3f41ce9a5c..a37ac40c523 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pxd +++ b/python/pylibcudf/pylibcudf/strings/strip.pxd @@ -1,17 +1,16 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.scalar cimport Scalar from pylibcudf.strings.side_type cimport side_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column strip( Column input, side_type side=*, Scalar to_strip=*, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyi b/python/pylibcudf/pylibcudf/strings/strip.pyi index ecb80b632d7..786079769c7 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyi +++ b/python/pylibcudf/pylibcudf/strings/strip.pyi @@ -1,17 +1,17 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar from pylibcudf.strings.side_type import SideType +from pylibcudf.utils import CudaStreamLike def strip( input: Column, side: SideType = SideType.BOTH, to_strip: Scalar | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
diff --git a/python/pylibcudf/pylibcudf/strings/strip.pyx b/python/pylibcudf/pylibcudf/strings/strip.pyx index 3b477fa83ad..607428b6f69 100644 --- a/python/pylibcudf/pylibcudf/strings/strip.pyx +++ b/python/pylibcudf/pylibcudf/strings/strip.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from cython.operator cimport dereference @@ -16,6 +16,7 @@ from pylibcudf.strings.side_type cimport side_type from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["strip"] @@ -23,7 +24,7 @@ cpdef Column strip( Column input, side_type side=side_type.BOTH, Scalar to_strip=None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Removes the specified characters from the beginning @@ -47,12 +48,13 @@ cpdef Column strip( pylibcudf.Column New strings column. 
""" - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if to_strip is None: to_strip = Scalar.from_libcudf( - cpp_make_string_scalar("".encode(), stream.view(), mr.get_mr()) + cpp_make_string_scalar("".encode(), _stream.view().value(), mr.get_mr()) ) cdef unique_ptr[column] c_result @@ -64,8 +66,8 @@ cpdef Column strip( input.view(), side, dereference(cpp_to_strip), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/strings/translate.pxd index 2d74e2f4a2c..d6a80ddfd43 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pxd +++ b/python/pylibcudf/pylibcudf/strings/translate.pxd @@ -1,14 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.strings.translate cimport filter_type from pylibcudf.scalar cimport Scalar from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column translate( - Column input, dict chars_table, Stream stream=*, DeviceMemoryResource mr=* + Column input, dict chars_table, object stream = *, DeviceMemoryResource mr=* ) cpdef Column filter_characters( @@ -16,6 +15,6 @@ cpdef Column filter_characters( dict characters_to_filter, filter_type keep_characters, Scalar replacement, - Stream stream=*, + object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyi b/python/pylibcudf/pylibcudf/strings/translate.pyi index a01b786fd6f..9e7624e0b17 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyi +++ b/python/pylibcudf/pylibcudf/strings/translate.pyi @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from collections.abc import Mapping from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.scalar import Scalar +from pylibcudf.utils import CudaStreamLike class FilterType(IntEnum): KEEP = ... @@ -16,7 +16,7 @@ class FilterType(IntEnum): def translate( input: Column, chars_table: Mapping[int | str, int | str], - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... 
def filter_characters( @@ -24,6 +24,6 @@ def filter_characters( characters_to_filter: Mapping[int | str, int | str], keep_characters: FilterType, replacement: Scalar, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/translate.pyx b/python/pylibcudf/pylibcudf/strings/translate.pyx index 06c772330df..2a60ff881d4 100644 --- a/python/pylibcudf/pylibcudf/strings/translate.pyx +++ b/python/pylibcudf/pylibcudf/strings/translate.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -15,6 +15,7 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream from cython.operator import dereference +from cuda.bindings.cyruntime cimport cudaStream_t from pylibcudf.libcudf.strings.translate import \ filter_type as FilterType # no-cython-lint @@ -43,7 +44,7 @@ cdef vector[pair[char_utf8, char_utf8]] _table_to_c_table(dict table): cpdef Column translate( - Column input, dict chars_table, Stream stream=None, DeviceMemoryResource mr=None + Column input, dict chars_table, object stream=None, DeviceMemoryResource mr=None ): """ Translates individual characters within each string. 
@@ -69,17 +70,18 @@ cpdef Column translate( cdef vector[pair[char_utf8, char_utf8]] c_chars_table = _table_to_c_table( chars_table ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_translate.translate( input.view(), c_chars_table, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column filter_characters( @@ -87,7 +89,7 @@ cpdef Column filter_characters( dict characters_to_filter, filter_type keep_characters, Scalar replacement, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """ @@ -124,7 +126,8 @@ cpdef Column filter_characters( cdef const string_scalar* c_replacement = ( replacement.c_obj.get() ) - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -133,9 +136,9 @@ cpdef Column filter_characters( c_characters_to_filter, keep_characters, dereference(c_replacement), - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) FilterType.__str__ = FilterType.__repr__ diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/strings/wrap.pxd index 62faaff36f0..ea74927498d 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pxd +++ b/python/pylibcudf/pylibcudf/strings/wrap.pxd @@ -1,12 +1,11 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from pylibcudf.column cimport Column from pylibcudf.libcudf.types cimport size_type from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -from rmm.pylibrmm.stream cimport Stream cpdef Column wrap( - Column input, size_type width, Stream stream=*, DeviceMemoryResource mr=* + Column input, size_type width, object stream = *, DeviceMemoryResource mr=* ) diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyi b/python/pylibcudf/pylibcudf/strings/wrap.pyi index 00c939cc420..aa88b64a391 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyi +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyi @@ -1,14 +1,14 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column +from pylibcudf.utils import CudaStreamLike def wrap( input: Column, width: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/strings/wrap.pyx b/python/pylibcudf/pylibcudf/strings/wrap.pyx index 504c469debc..28bc310b5a4 100644 --- a/python/pylibcudf/pylibcudf/strings/wrap.pyx +++ b/python/pylibcudf/pylibcudf/strings/wrap.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr @@ -10,11 +10,12 @@ from pylibcudf.libcudf.types cimport size_type from pylibcudf.utils cimport _get_stream, _get_memory_resource from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from rmm.pylibrmm.stream cimport Stream +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["wrap"] cpdef Column wrap( - Column input, size_type width, Stream stream=None, DeviceMemoryResource mr=None + Column input, size_type width, object stream=None, DeviceMemoryResource mr=None ): """ Wraps strings onto multiple lines shorter than `width` by @@ -41,15 +42,16 @@ cpdef Column wrap( Column of wrapped strings """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_wrap.wrap( input.view(), width, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) diff --git a/python/pylibcudf/pylibcudf/table.pxd b/python/pylibcudf/pylibcudf/table.pxd index 4a4a963e0de..76c38dacf3f 100644 --- a/python/pylibcudf/pylibcudf/table.pxd +++ b/python/pylibcudf/pylibcudf/table.pxd @@ -4,7 +4,6 @@ from libcpp.memory cimport unique_ptr from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource cdef class Table: @@ -20,7 +19,7 @@ cdef class Table: @staticmethod cdef Table from_libcudf( unique_ptr[table] libcudf_tbl, - Stream stream, + object stream, DeviceMemoryResource mr ) @@ -31,8 +30,8 @@ cdef class Table: cdef Table from_table_view_of_arbitrary( const table_view& tv, object owner, - Stream stream, + object stream, ) cpdef list columns(self) - cpdef Table copy(self, Stream stream=*, DeviceMemoryResource 
mr=*) + cpdef Table copy(self, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/table.pyi b/python/pylibcudf/pylibcudf/table.pyi index 0f8de52b132..263bf813c75 100644 --- a/python/pylibcudf/pylibcudf/table.pyi +++ b/python/pylibcudf/pylibcudf/table.pyi @@ -4,11 +4,11 @@ from typing import Any from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf._interop_helpers import ArrowLike, ColumnMetadata from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike class Table: def __init__(self, column: list[Column]): ... @@ -18,22 +18,22 @@ class Table: def columns(self) -> list[Column]: ... def copy( self, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... def to_arrow( self, metadata: list[ColumnMetadata | str] | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, ) -> ArrowLike: ... # Private methods below are included because polars is currently using them, # but we want to remove stubs for these private methods eventually def _to_schema(self, metadata: Any = None) -> Any: ... - def _to_host_array(self, stream: Stream) -> Any: ... + def _to_host_array(self, stream: CudaStreamLike) -> Any: ... @staticmethod def from_arrow( arrow_like: ArrowLike, dtype: DataType | None = None, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... 
diff --git a/python/pylibcudf/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx index 654cf9bb60b..6b62a5428f9 100644 --- a/python/pylibcudf/pylibcudf/table.pyx +++ b/python/pylibcudf/pylibcudf/table.pyx @@ -39,6 +39,7 @@ from pylibcudf._interop_helpers cimport ( _metadata_to_libcudf, ) from ._interop_helpers import ArrowLike, ColumnMetadata, _ObjectWithArrowMetadata +from cuda.bindings.cyruntime cimport cudaStream_t try: import pyarrow as pa @@ -105,7 +106,7 @@ cdef class Table: def from_arrow( obj: ArrowLike, dtype: DataType | None = None, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None ) -> Table: """ @@ -154,7 +155,8 @@ cdef class Table: cdef _ArrowTableHolder result cdef unique_ptr[arrow_table] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) if hasattr(obj, "__arrow_c_device_array__"): @@ -170,7 +172,7 @@ cdef class Table: c_result = make_unique[arrow_table]( move(dereference(c_schema)), move(dereference(c_array)), - stream.view(), + _cs, result.mr.get_mr(), ) result.tbl.swap(c_result) @@ -193,7 +195,7 @@ cdef class Table: with nogil: c_result = make_unique[arrow_table]( move(dereference(c_stream)), - stream.view(), + _cs, result.mr.get_mr(), ) result.tbl.swap(c_result) @@ -233,7 +235,7 @@ cdef class Table: @staticmethod cdef Table from_libcudf( unique_ptr[table] libcudf_tbl, - Stream stream, + object stream, DeviceMemoryResource mr ): """Create a Table from a libcudf table. @@ -275,7 +277,7 @@ cdef class Table: cdef Table from_table_view_of_arbitrary( const table_view& tv, object owner, - Stream stream, + object stream, ): """Create a Table from a libcudf table_view into an arbitrary owner. @@ -292,8 +294,9 @@ cdef class Table: # For efficiency, prohibit calling this overload with a Table owner. 
assert not isinstance(owner, Table) cdef int i + cdef Stream _stream = stream return Table([ - Column.from_column_view_of_arbitrary(tv.column(i), owner, stream) + Column.from_column_view_of_arbitrary(tv.column(i), owner, _stream) for i in range(tv.num_columns()) ]) @@ -315,7 +318,7 @@ cdef class Table: """The shape of this table""" return (self.num_rows(), self.num_columns()) - cpdef Table copy(self, Stream stream=None, DeviceMemoryResource mr=None): + cpdef Table copy(self, object stream=None, DeviceMemoryResource mr=None): """Create a deep copy of the table. Parameters @@ -330,9 +333,9 @@ cdef class Table: Table A new Table with deep copies of all columns. """ - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) mr = _get_memory_resource(mr) - return Table([col.copy(stream, mr) for col in self._columns]) + return Table([col.copy(_stream, mr) for col in self._columns]) def _to_schema(self, metadata=None): """Create an Arrow schema from this table.""" @@ -356,11 +359,13 @@ cdef class Table: return PyCapsule_New(raw_schema_ptr, "arrow_schema", _release_schema) - def _to_host_array(self, Stream stream): + def _to_host_array(self, object stream): cdef ArrowArray* raw_host_array_ptr + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() with nogil: - raw_host_array_ptr = to_arrow_host_raw(self.view(), stream.view()) + raw_host_array_ptr = to_arrow_host_raw(self.view(), _cs) return PyCapsule_New(raw_host_array_ptr, "arrow_array", _release_array) diff --git a/python/pylibcudf/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd index a92ffb3f27e..8333abd6df0 100644 --- a/python/pylibcudf/pylibcudf/transform.pxd +++ b/python/pylibcudf/pylibcudf/transform.pxd @@ -3,7 +3,6 @@ from libcpp cimport bool from pylibcudf.libcudf.types cimport bitmask_type, data_type from pylibcudf.libcudf.types cimport null_aware, output_nullability -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource 
cimport DeviceMemoryResource from .column cimport Column @@ -14,30 +13,30 @@ from .types cimport DataType cpdef tuple[gpumemoryview, int] nans_to_nulls( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) cpdef Column column_nans_to_nulls( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) cpdef Column compute_column( - Table input, Expression expr, Stream stream = *, DeviceMemoryResource mr = * + Table input, Expression expr, object stream = *, DeviceMemoryResource mr = * ) cpdef Column compute_column_jit( - Table input, Expression expr, Stream stream = *, DeviceMemoryResource mr = * + Table input, Expression expr, object stream = *, DeviceMemoryResource mr = * ) cpdef tuple[gpumemoryview, int] bools_to_mask( - Column input, Stream stream = *, DeviceMemoryResource mr = * + Column input, object stream = *, DeviceMemoryResource mr = * ) cpdef Column mask_to_bools( Py_ssize_t bitmask, int begin_bit, int end_bit, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) @@ -48,17 +47,17 @@ cpdef Column transform( bool is_ptx, null_aware is_null_aware, output_nullability null_policy, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) cpdef tuple[Table, Column] encode( - Table input, Stream stream = *, DeviceMemoryResource mr = * + Table input, object stream = *, DeviceMemoryResource mr = * ) cpdef Table one_hot_encode( Column input_column, Column categories, - Stream stream = *, + object stream = *, DeviceMemoryResource mr = *, ) diff --git a/python/pylibcudf/pylibcudf/transform.pyi b/python/pylibcudf/pylibcudf/transform.pyi index 2d2038f07a0..e979575f590 100644 --- a/python/pylibcudf/pylibcudf/transform.pyi +++ b/python/pylibcudf/pylibcudf/transform.pyi @@ -1,46 +1,46 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.expressions import Expression from pylibcudf.gpumemoryview import gpumemoryview from pylibcudf.table import Table from pylibcudf.types import DataType, NullAware, OutputNullability +from pylibcudf.utils import CudaStreamLike def nans_to_nulls( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[gpumemoryview, int]: ... def column_nans_to_nulls( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def compute_column( input: Table, expr: Expression, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def compute_column_jit( input: Table, expr: Expression, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def bools_to_mask( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[gpumemoryview, int]: ... def mask_to_bools( bitmask: int, begin_bit: int, end_bit: int, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def transform( @@ -50,17 +50,17 @@ def transform( is_ptx: bool, null_aware: NullAware = NullAware.NO, null_policy: OutputNullability = OutputNullability.PRESERVE, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def encode( input: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> tuple[Table, Column]: ... 
def one_hot_encode( input: Column, categories: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index 3baf6c5306e..0025ed7d566 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -26,6 +26,7 @@ from .expressions cimport Expression from .gpumemoryview cimport gpumemoryview from .types cimport DataType, null_aware, output_nullability from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "bools_to_mask", @@ -41,7 +42,7 @@ __all__ = [ cpdef tuple[gpumemoryview, int] nans_to_nulls( Column input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a null mask preserving existing nulls and converting nans to null. @@ -63,21 +64,26 @@ cpdef tuple[gpumemoryview, int] nans_to_nulls( """ cdef pair[unique_ptr[device_buffer], size_type] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_transform.nans_to_nulls(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_transform.nans_to_nulls( + input.view(), _cs, mr.get_mr() + ) return ( - gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first), stream, mr)), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(c_result.first), _stream, mr) + ), c_result.second ) cpdef Column column_nans_to_nulls( Column input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a column with nans converted to nulls. 
@@ -100,19 +106,20 @@ cpdef Column column_nans_to_nulls( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_transform.column_nans_to_nulls( - input.view(), stream.view(), mr.get_mr() + input.view(), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column compute_column( - Table input, Expression expr, Stream stream=None, DeviceMemoryResource mr=None + Table input, Expression expr, object stream=None, DeviceMemoryResource mr=None ): """Create a column by evaluating an expression on a table. @@ -135,19 +142,20 @@ cpdef Column compute_column( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_transform.compute_column( - input.view(), dereference(expr.c_obj.get()), stream.view(), mr.get_mr() + input.view(), dereference(expr.c_obj.get()), _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column compute_column_jit( - Table input, Expression expr, Stream stream=None, DeviceMemoryResource mr=None + Table input, Expression expr, object stream=None, DeviceMemoryResource mr=None ): """ Create a column by evaluating an expression on a table @@ -172,20 +180,21 @@ cpdef Column compute_column_jit( """ cdef unique_ptr[column] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_transform.compute_column_jit( - input.view(), dereference(expr.c_obj.get()), stream.view(), mr.get_mr() + input.view(), dereference(expr.c_obj.get()), _cs, 
mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef tuple[gpumemoryview, int] bools_to_mask( Column input, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a bitmask from a column of boolean elements @@ -206,14 +215,19 @@ cpdef tuple[gpumemoryview, int] bools_to_mask( """ cdef pair[unique_ptr[device_buffer], size_type] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_transform.bools_to_mask(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_transform.bools_to_mask( + input.view(), _cs, mr.get_mr() + ) return ( - gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first), stream, mr)), + gpumemoryview( + DeviceBuffer.c_from_unique_ptr(move(c_result.first), _stream, mr) + ), c_result.second ) @@ -222,7 +236,7 @@ cpdef Column mask_to_bools( Py_ssize_t bitmask, int begin_bit, int end_bit, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Creates a boolean column from given bitmask. 
@@ -248,7 +262,8 @@ cpdef Column mask_to_bools( cdef unique_ptr[column] c_result cdef bitmask_type * bitmask_ptr = bitmask - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: @@ -256,11 +271,11 @@ cpdef Column mask_to_bools( bitmask_ptr, begin_bit, end_bit, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef Column transform( @@ -270,7 +285,7 @@ cpdef Column transform( bool is_ptx, null_aware is_null_aware, output_nullability null_policy, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Create a new column by applying a transform function against @@ -312,7 +327,8 @@ cpdef Column transform( cdef output_nullability c_null_policy = null_policy cdef optional[void *] user_data - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) for input in inputs: @@ -327,14 +343,14 @@ cpdef Column transform( user_data, c_is_null_aware, c_null_policy, - stream.view(), + _cs, mr.get_mr() ) - return Column.from_libcudf(move(c_result), stream, mr) + return Column.from_libcudf(move(c_result), _stream, mr) cpdef tuple[Table, Column] encode( - Table input, Stream stream=None, DeviceMemoryResource mr=None + Table input, object stream=None, DeviceMemoryResource mr=None ): """Encode the rows of the given table as integers. 
@@ -355,21 +371,22 @@ cpdef tuple[Table, Column] encode( """ cdef pair[unique_ptr[table], unique_ptr[column]] c_result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - c_result = cpp_transform.encode(input.view(), stream.view(), mr.get_mr()) + c_result = cpp_transform.encode(input.view(), _cs, mr.get_mr()) return ( - Table.from_libcudf(move(c_result.first), stream, mr), - Column.from_libcudf(move(c_result.second), stream, mr) + Table.from_libcudf(move(c_result.first), _stream, mr), + Column.from_libcudf(move(c_result.second), _stream, mr) ) cpdef Table one_hot_encode( Column input, Column categories, - Stream stream=None, + object stream=None, DeviceMemoryResource mr=None, ): """Encodes `input` by generating a new column @@ -395,19 +412,20 @@ cpdef Table one_hot_encode( cdef pair[unique_ptr[column], table_view] c_result cdef Table owner_table - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_transform.one_hot_encode( input.view(), categories.view(), - stream.view(), + _cs, mr.get_mr() ) owner_table = Table( - [Column.from_libcudf(move(c_result.first), stream, mr)] + [Column.from_libcudf(move(c_result.first), _stream, mr)] * c_result.second.num_columns() ) diff --git a/python/pylibcudf/pylibcudf/transpose.pxd b/python/pylibcudf/pylibcudf/transpose.pxd index 6c432a62b5f..a63d52da9e1 100644 --- a/python/pylibcudf/pylibcudf/transpose.pxd +++ b/python/pylibcudf/pylibcudf/transpose.pxd @@ -1,9 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from .table cimport Table -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource -cpdef Table transpose(Table input_table, Stream stream=*, DeviceMemoryResource mr=*) +cpdef Table transpose(Table input_table, object stream = *, DeviceMemoryResource mr=*) diff --git a/python/pylibcudf/pylibcudf/transpose.pyi b/python/pylibcudf/pylibcudf/transpose.pyi index 4487e49feaf..fbf2d3fce2d 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyi +++ b/python/pylibcudf/pylibcudf/transpose.pyi @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.table import Table +from pylibcudf.utils import CudaStreamLike def transpose( input_table: Table, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx index e7cdbe503eb..e15aa45ce77 100644 --- a/python/pylibcudf/pylibcudf/transpose.pyx +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair @@ -13,11 +13,12 @@ from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column from .table cimport Table from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = ["transpose"] cpdef Table transpose( - Table input_table, Stream stream=None, DeviceMemoryResource mr=None + Table input_table, object stream=None, DeviceMemoryResource mr=None ): """Transpose a Table. @@ -39,16 +40,17 @@ cpdef Table transpose( """ cdef pair[unique_ptr[column], table_view] c_result cdef Table owner_table - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: c_result = cpp_transpose.transpose( - input_table.view(), stream.view(), mr.get_mr() + input_table.view(), _cs, mr.get_mr() ) owner_table = Table( - [Column.from_libcudf(move(c_result.first), stream, mr)] * + [Column.from_libcudf(move(c_result.first), _stream, mr)] * c_result.second.num_columns() ) diff --git a/python/pylibcudf/pylibcudf/unary.pxd b/python/pylibcudf/pylibcudf/unary.pxd index 69ec06ecea6..44a4f796085 100644 --- a/python/pylibcudf/pylibcudf/unary.pxd +++ b/python/pylibcudf/pylibcudf/unary.pxd @@ -1,9 +1,8 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool from pylibcudf.libcudf.unary cimport unary_operator -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource from .column cimport Column @@ -11,19 +10,19 @@ from .types cimport DataType cpdef Column unary_operation( - Column input, unary_operator op, Stream stream = *, DeviceMemoryResource mr = * + Column input, unary_operator op, object stream = *, DeviceMemoryResource mr = * ) -cpdef Column is_null(Column input, Stream stream = *, DeviceMemoryResource mr = *) +cpdef Column is_null(Column input, object stream = *, DeviceMemoryResource mr = *) -cpdef Column is_valid(Column input, Stream stream = *, DeviceMemoryResource mr = *) +cpdef Column is_valid(Column input, object stream = *, DeviceMemoryResource mr = *) cpdef Column cast( - Column input, DataType data_type, Stream stream = *, DeviceMemoryResource mr = * + Column input, DataType data_type, object stream = *, DeviceMemoryResource mr = * ) -cpdef Column is_nan(Column input, Stream stream = *, DeviceMemoryResource mr = *) +cpdef Column is_nan(Column input, object stream = *, DeviceMemoryResource mr = *) -cpdef Column is_not_nan(Column input, Stream stream = *, DeviceMemoryResource mr = *) +cpdef Column is_not_nan(Column input, object stream = *, DeviceMemoryResource mr = *) cpdef bool is_supported_cast(DataType from_, DataType to) diff --git a/python/pylibcudf/pylibcudf/unary.pyi b/python/pylibcudf/pylibcudf/unary.pyi index 6a77f7998b9..dd3d42404e7 100644 --- a/python/pylibcudf/pylibcudf/unary.pyi +++ b/python/pylibcudf/pylibcudf/unary.pyi @@ -1,13 +1,13 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from enum import IntEnum from rmm.pylibrmm.memory_resource import DeviceMemoryResource -from rmm.pylibrmm.stream import Stream from pylibcudf.column import Column from pylibcudf.types import DataType +from pylibcudf.utils import CudaStreamLike class UnaryOperator(IntEnum): SIN = ... @@ -38,33 +38,33 @@ class UnaryOperator(IntEnum): def unary_operation( input: Column, op: UnaryOperator, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_null( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_valid( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def cast( input: Column, data_type: DataType, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_nan( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_not_nan( input: Column, - stream: Stream | None = None, + stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... def is_supported_cast(from_: DataType, to: DataType) -> bool: ... diff --git a/python/pylibcudf/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx index da5b08df685..e0614037012 100644 --- a/python/pylibcudf/pylibcudf/unary.pyx +++ b/python/pylibcudf/pylibcudf/unary.pyx @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 from libcpp cimport bool @@ -16,6 +16,7 @@ from pylibcudf.libcudf.unary import \ from .column cimport Column from .types cimport DataType from .utils cimport _get_stream, _get_memory_resource +from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "UnaryOperator", @@ -29,7 +30,7 @@ __all__ = [ ] cpdef Column unary_operation( - Column input, unary_operator op, Stream stream=None, DeviceMemoryResource mr=None + Column input, unary_operator op, object stream=None, DeviceMemoryResource mr=None ): """Perform a unary operation on a column. @@ -53,16 +54,19 @@ cpdef Column unary_operation( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_unary.unary_operation(input.view(), op, stream.view(), mr.get_mr()) + result = cpp_unary.unary_operation( + input.view(), op, _cs, mr.get_mr() + ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) -cpdef Column is_null(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_null(Column input, object stream=None, DeviceMemoryResource mr=None): """Check whether elements of a column are null. For details, see :cpp:func:`is_null`. 
@@ -83,16 +87,17 @@ cpdef Column is_null(Column input, Stream stream=None, DeviceMemoryResource mr=N """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_unary.is_null(input.view(), stream.view(), mr.get_mr()) + result = cpp_unary.is_null(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) -cpdef Column is_valid(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_valid(Column input, object stream=None, DeviceMemoryResource mr=None): """Check whether elements of a column are valid. For details, see :cpp:func:`is_valid`. @@ -113,17 +118,18 @@ cpdef Column is_valid(Column input, Stream stream=None, DeviceMemoryResource mr= """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_unary.is_valid(input.view(), stream.view(), mr.get_mr()) + result = cpp_unary.is_valid(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef Column cast( - Column input, DataType data_type, Stream stream=None, DeviceMemoryResource mr=None + Column input, DataType data_type, object stream=None, DeviceMemoryResource mr=None ): """Cast a column to a different data type. 
@@ -147,18 +153,19 @@ cpdef Column cast( """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: result = cpp_unary.cast( - input.view(), data_type.c_obj, stream.view(), mr.get_mr() + input.view(), data_type.c_obj, _cs, mr.get_mr() ) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) -cpdef Column is_nan(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_nan(Column input, object stream=None, DeviceMemoryResource mr=None): """Check whether elements of a column are nan. For details, see :cpp:func:`is_nan`. @@ -179,16 +186,17 @@ cpdef Column is_nan(Column input, Stream stream=None, DeviceMemoryResource mr=No """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_unary.is_nan(input.view(), stream.view(), mr.get_mr()) + result = cpp_unary.is_nan(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) -cpdef Column is_not_nan(Column input, Stream stream=None, DeviceMemoryResource mr=None): +cpdef Column is_not_nan(Column input, object stream=None, DeviceMemoryResource mr=None): """Check whether elements of a column are not nan. For details, see :cpp:func:`is_not_nan`. 
@@ -209,13 +217,14 @@ cpdef Column is_not_nan(Column input, Stream stream=None, DeviceMemoryResource m """ cdef unique_ptr[column] result - stream = _get_stream(stream) + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() mr = _get_memory_resource(mr) with nogil: - result = cpp_unary.is_not_nan(input.view(), stream.view(), mr.get_mr()) + result = cpp_unary.is_not_nan(input.view(), _cs, mr.get_mr()) - return Column.from_libcudf(move(result), stream, mr) + return Column.from_libcudf(move(result), _stream, mr) cpdef bool is_supported_cast(DataType from_, DataType to): """Check if a cast between datatypes is supported. diff --git a/python/pylibcudf/pylibcudf/utils.pxd b/python/pylibcudf/pylibcudf/utils.pxd index b3d2928f398..feb82cea18f 100644 --- a/python/pylibcudf/pylibcudf/utils.pxd +++ b/python/pylibcudf/pylibcudf/utils.pxd @@ -1,12 +1,12 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 from libcpp.functional cimport reference_wrapper from libcpp.vector cimport vector from pylibcudf.libcudf.scalar.scalar cimport scalar -from rmm.pylibrmm.stream cimport Stream from rmm.pylibrmm.memory_resource cimport DeviceMemoryResource +from rmm.pylibrmm.stream cimport Stream cdef vector[reference_wrapper[const scalar]] _as_vector(list source) -cpdef Stream _get_stream(Stream stream = *) +cpdef Stream _get_stream(object stream = *) cdef DeviceMemoryResource _get_memory_resource(DeviceMemoryResource mr = *) diff --git a/python/pylibcudf/pylibcudf/utils.pyi b/python/pylibcudf/pylibcudf/utils.pyi index 21f669898ba..cc3cb93e6c0 100644 --- a/python/pylibcudf/pylibcudf/utils.pyi +++ b/python/pylibcudf/pylibcudf/utils.pyi @@ -1,6 +1,13 @@ # SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 +from typing import Protocol + from rmm.pylibrmm.stream import Stream -def _get_stream(stream: Stream | None = None) -> Stream: ... +class HasCudaStream(Protocol): + def __cuda_stream__(self) -> tuple[int, int]: ... + +CudaStreamLike = Stream | HasCudaStream + +def _get_stream(stream: CudaStreamLike | None = None) -> Stream: ... diff --git a/python/pylibcudf/pylibcudf/utils.pyx b/python/pylibcudf/pylibcudf/utils.pyx index 70460e19481..314e62f7760 100644 --- a/python/pylibcudf/pylibcudf/utils.pyx +++ b/python/pylibcudf/pylibcudf/utils.pyx @@ -47,10 +47,12 @@ cdef vector[reference_wrapper[const scalar]] _as_vector(list source): return c_scalars -cpdef Stream _get_stream(Stream stream = None): +cpdef Stream _get_stream(object stream = None): if stream is None: return CUDF_DEFAULT_STREAM - return stream + if isinstance(stream, Stream): + return stream + return Stream(stream) # Handles __cuda_stream__ protocol cdef DeviceMemoryResource _get_memory_resource(DeviceMemoryResource mr = None): diff --git a/python/pylibcudf/tests/test_experimental.py b/python/pylibcudf/tests/test_experimental.py index eaf06ff62ae..ed180e8db29 100644 --- a/python/pylibcudf/tests/test_experimental.py +++ b/python/pylibcudf/tests/test_experimental.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 import pytest @@ -21,6 +21,7 @@ def test_join_streams(streams: list[Stream], stream: Stream): plc.experimental.join_streams(streams, stream) +@pytest.mark.uses_custom_stream def test_join_streams_type_error(): """Test that join_streams raises appropriate errors for invalid inputs.""" main_stream = Stream() @@ -29,16 +30,10 @@ def test_join_streams_type_error(): with pytest.raises(TypeError): plc.experimental.join_streams(None, main_stream) - # Test with non-Stream in list - with pytest.raises( - TypeError, - match="Cannot convert NoneType to rmm.pylibrmm.stream.Stream", - ): - plc.experimental.join_streams([None], main_stream) - - # Test with non-Stream as main stream - with pytest.raises( - TypeError, - match="Cannot convert NoneType to rmm.pylibrmm.stream.Stream", - ): - plc.experimental.join_streams([Stream()], None) + # Protocol stream should be accepted + class _CudaStreamProto: + def __cuda_stream__(self): + return (0, 0) + + plc.experimental.join_streams([_CudaStreamProto()], main_stream) + plc.experimental.join_streams([Stream()], _CudaStreamProto()) diff --git a/python/pylibcudf/tests/test_stream_protocol.py b/python/pylibcudf/tests/test_stream_protocol.py new file mode 100644 index 00000000000..075c49bd0b3 --- /dev/null +++ b/python/pylibcudf/tests/test_stream_protocol.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +import pyarrow as pa +import pytest + +from rmm.pylibrmm.stream import Stream + +import pylibcudf as plc + + +class _CudaStreamProto: + """Minimal __cuda_stream__ protocol object for testing.""" + + def __cuda_stream__(self): + return (0, 0) + + +def test_get_stream_none(): + stream = plc.utils._get_stream(None) + assert isinstance(stream, Stream) + + +def test_get_stream_stream_object(): + stream = Stream() + result = plc.utils._get_stream(stream) + assert result is stream + + +def test_get_stream_protocol_object(): + proto = _CudaStreamProto() + result = plc.utils._get_stream(proto) + assert isinstance(result, Stream) + + +@pytest.mark.parametrize("stream", [None, Stream(), _CudaStreamProto()]) +def test_reduce_accepts_stream_protocol(stream): + arr = pa.array([1, 2, 3], type=pa.int32()) + col = plc.Column.from_arrow(arr) + agg = plc.aggregation.sum() + dtype = plc.DataType.from_arrow(pa.int32()) + result = plc.reduce.reduce(col, agg, dtype, stream=stream) + assert result.to_py() == 6 + + +@pytest.mark.parametrize("stream", [None, Stream(), _CudaStreamProto()]) +def test_binary_operation_accepts_stream_protocol(stream): + lhs = plc.Column.from_arrow(pa.array([1, 2, 3], type=pa.int32())) + rhs = plc.Column.from_arrow(pa.array([4, 5, 6], type=pa.int32())) + dtype = plc.DataType.from_arrow(pa.int32()) + result = plc.binaryop.binary_operation( + lhs, + rhs, + plc.binaryop.BinaryOperator.ADD, + dtype, + stream=stream, + ) + expect = pa.array([5, 7, 9], type=pa.int32()) + assert result.to_arrow().equals(expect) + + +@pytest.mark.parametrize("stream", [None, Stream(), _CudaStreamProto()]) +def test_gather_accepts_stream_protocol(stream): + table = plc.Table.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]})) + indices = plc.Column.from_arrow(pa.array([2, 0], type=pa.int32())) + result = plc.copying.gather( + table, + indices, + plc.copying.OutOfBoundsPolicy.DONT_CHECK, + stream=stream, + ) + expected = pa.table({"a": [3, 1], 
"b": [6, 4]}) + got = result.to_arrow().rename_columns(expected.column_names) + assert got.cast(expected.schema).equals(expected) From b45c5aaae0b396de9aead82a2f3daf6ba5ef3b10 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 May 2026 16:41:46 -0700 Subject: [PATCH 29/36] Use `language: script` for cudf-polars-ir-signatures pre-commit hook (#22384) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `cudf-polars-ir-signatures` pre-commit hook uses `language: python` but is just a local script (`./ci/check_cudf_polars_ir.py`) that only depends on stdlib modules (`ast`, `argparse`, `sys`, `typing`) and has a `#!/usr/bin/env python3` shebang. With `language: python`, pre-commit unnecessarily creates a virtualenv for this hook. `language: script` is the correct setting — it runs the entry point directly as an executable, relying on the shebang for interpreter selection, with no virtualenv overhead. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/22384 --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1fb05425bd3..a51294a8f26 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -136,7 +136,7 @@ repos: name: cudf-polars-ir-signatures description: 'Validate cudf-polars IR.do_evaluate signatures.' 
entry: ./ci/check_cudf_polars_ir.py - language: python + language: script files: ^python/cudf_polars/cudf_polars/(dsl/ir|experimental/(shuffle|io|sort))\.py$ pass_filenames: true verbose: true From 8a0d5f951fee350b9557001639dda2002fb7a150 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 6 May 2026 18:14:35 -0700 Subject: [PATCH 30/36] Fix potential errors in Parquet page header decode (#22274) This PR fixes a potential infinite loop in parquet page header count/decode kernels in case of malformed input. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Paul Mattione (https://github.com/pmattione-nvidia) URL: https://github.com/rapidsai/cudf/pull/22274 --- cpp/src/io/parquet/page_hdr.cu | 132 ++++++++++--------- cpp/src/io/parquet/parquet_gpu.hpp | 2 + cpp/src/io/parquet/reader_impl_preprocess.cu | 8 +- 3 files changed, 81 insertions(+), 61 deletions(-) diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu index 83724dd71e2..8e7a6223447 100644 --- a/cpp/src/io/parquet/page_hdr.cu +++ b/cpp/src/io/parquet/page_hdr.cu @@ -110,8 +110,8 @@ inline __device__ int32_t get_i32(byte_stream_s* bs) */ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) { - int struct_depth = 0; - int rep_cnt = 0; + uint32_t struct_depth = 0; + uint32_t rep_cnt = 0; do { if (rep_cnt != 0) { @@ -138,7 +138,7 @@ __device__ void skip_struct_field(byte_stream_s* bs, int field_type) case FieldType::LIST: case FieldType::SET: { // NOTE: skipping a list of lists is not handled auto const c = getb(bs); - int n = c >> 4; + uint32_t n = c >> 4; if (n == 0xf) { n = get_u32(bs); } field_type = c & 0xf; if (static_cast(field_type) == FieldType::STRUCT) { @@ -543,7 +543,6 @@ void __launch_bounds__(decode_page_headers_block_size) auto const block = cg::this_thread_block(); auto const warp = cg::tiled_partition(block); - auto const lane_id = 
warp.thread_rank(); auto const warp_id = warp.meta_group_rank(); auto const chunk_idx = static_cast((cg::this_grid().block_rank() * num_warps_per_block) + warp_id); @@ -554,20 +553,20 @@ void __launch_bounds__(decode_page_headers_block_size) auto const bs = &bs_g[warp_id]; - if (lane_id == 0) { + cg::invoke_one(warp, [&] { if (chunk_idx < num_chunks) { bs->ck = chunks[chunk_idx]; } error[warp_id] = 0; - } + }); block.sync(); if (chunk_idx < num_chunks) { - if (lane_id == 0) { + cg::invoke_one(warp, [&] { bs->base = bs->cur = bs->ck.compressed_data; bs->end = bs->base + bs->ck.compressed_size; bs->page.chunk_idx = chunk_idx; bs->page.src_col_schema = bs->ck.src_col_schema; zero_out_page_header_info(bs); - } + }); size_t const num_values = bs->ck.num_values; size_t values_found = 0; uint32_t data_page_count = 0; @@ -580,7 +579,7 @@ void __launch_bounds__(decode_page_headers_block_size) while (values_found < num_values and bs->cur < bs->end) { int index_out = -1; - if (lane_id == 0) { + cg::invoke_one(warp, [&] { // this computation is only valid for flat schemas. 
for nested schemas, // they will be recomputed in the preprocess step by examining repetition and // definition levels @@ -593,7 +592,7 @@ void __launch_bounds__(decode_page_headers_block_size) bs->page.num_nulls = 0; bs->page.lvl_bytes[level_type::DEFINITION] = 0; bs->page.lvl_bytes[level_type::REPETITION] = 0; - if (parse_page_header_fn{}(bs) and bs->page.compressed_page_size >= 0) { + if (parse_page_header_fn{}(bs) and bs->page.compressed_page_size > 0) { if (not is_supported_encoding(bs->page.encoding)) { error[warp_id] |= static_cast(decode_error::UNSUPPORTED_ENCODING); @@ -641,11 +640,13 @@ void __launch_bounds__(decode_page_headers_block_size) bs->cur = bs->end; } if (index_out >= 0 and index_out < max_num_pages) { page_info[index_out] = bs->page; } - } + }); values_found = shuffle(values_found); warp.sync(); } - if (lane_id == 0 and error[warp_id] != 0) { set_error(error[warp_id], error_code); } + cg::invoke_one(warp, [&] { + if (error[warp_id] != 0) { set_error(error[warp_id], error_code); } + }); } } @@ -664,7 +665,6 @@ CUDF_KERNEL void __launch_bounds__(count_page_headers_block_size) auto const block = cg::this_thread_block(); auto const warp = cg::tiled_partition(block); - auto const lane_id = warp.thread_rank(); auto const warp_id = warp.meta_group_rank(); auto const chunk_idx = static_cast((cg::this_grid().block_rank() * num_warps_per_block) + warp_id); @@ -675,25 +675,25 @@ CUDF_KERNEL void __launch_bounds__(count_page_headers_block_size) auto const bs = &bs_g[warp_id]; - if (lane_id == 0) { + cg::invoke_one(warp, [&] { if (chunk_idx < num_chunks) { bs->ck = chunks[chunk_idx]; } error[warp_id] = 0; - } + }); block.sync(); if (chunk_idx < num_chunks) { - if (lane_id == 0) { + cg::invoke_one(warp, [&] { bs->base = bs->cur = bs->ck.compressed_data; bs->end = bs->base + bs->ck.compressed_size; - } + }); size_t const num_values = bs->ck.num_values; size_t values_found = 0; uint32_t data_page_count = 0; uint32_t dictionary_page_count = 0; warp.sync(); 
while (values_found < num_values and bs->cur < bs->end) { - if (lane_id == 0) { - if (parse_page_header_fn{}(bs) and bs->page.compressed_page_size >= 0) { + cg::invoke_one(warp, [&] { + if (parse_page_header_fn{}(bs) and bs->page.compressed_page_size > 0) { if (not is_supported_encoding(bs->page.encoding)) { error[warp_id] |= static_cast(decode_error::UNSUPPORTED_ENCODING); @@ -724,15 +724,15 @@ CUDF_KERNEL void __launch_bounds__(count_page_headers_block_size) static_cast(decode_error::INVALID_PAGE_HEADER); bs->cur = bs->end; } - } + }); values_found = shuffle(values_found); warp.sync(); } - if (lane_id == 0) { + cg::invoke_one(warp, [&] { chunks[chunk_idx].num_data_pages = data_page_count; chunks[chunk_idx].num_dict_pages = dictionary_page_count; if (error[warp_id] != 0) { set_error(error[warp_id], error_code); } - } + }); } } @@ -784,8 +784,9 @@ struct decode_page_headers_with_pgidx_fn { // bs.page.chunk_row not computed here and will be filled in later by // `fill_in_page_info()`. - if (not parse_page_header_fn{}(&bs) or bs.page.compressed_page_size < 0) { - set_error(static_cast(decode_error::UNSUPPORTED_ENCODING), + // Parsed page must be valid and not empty + if (not parse_page_header_fn{}(&bs) or bs.page.compressed_page_size <= 0) { + set_error(static_cast(decode_error::INVALID_PAGE_HEADER), error_code); return; } @@ -834,54 +835,64 @@ struct decode_page_headers_with_pgidx_fn { * @param[in] num_chunks Number of column chunks */ CUDF_KERNEL void __launch_bounds__(build_string_dict_index_block_size) - build_string_dictionary_index_kernel(ColumnChunkDesc* chunks, int32_t num_chunks) + build_string_dictionary_index_kernel(ColumnChunkDesc* chunks, + int32_t num_chunks, + kernel_error::pointer error_code) { auto constexpr num_warps_per_block = build_string_dict_index_block_size / cudf::detail::warp_size; __shared__ ColumnChunkDesc chunk_g[num_warps_per_block]; - auto const block = cg::this_thread_block(); - auto const warp = cg::tiled_partition(block); - int const 
lane_id = warp.thread_rank(); - int const chunk = (cg::this_grid().block_rank() * num_warps_per_block) + warp.meta_group_rank(); + auto const block = cg::this_thread_block(); + auto const warp = cg::tiled_partition(block); + int const chunk = (cg::this_grid().block_rank() * num_warps_per_block) + warp.meta_group_rank(); ColumnChunkDesc* const ck = &chunk_g[warp.meta_group_rank()]; - if (chunk < num_chunks and lane_id == 0) *ck = chunks[chunk]; + cg::invoke_one(warp, [&] { + if (chunk < num_chunks) { *ck = chunks[chunk]; } + }); block.sync(); if (chunk >= num_chunks) { return; } - if (!lane_id && ck->num_dict_pages > 0 && ck->str_dict_index) { - // Data type to describe a string - string_index_pair* dict_index = ck->str_dict_index; - uint8_t const* dict = ck->dict_page->page_data; - int dict_size = ck->dict_page->uncompressed_page_size; - int num_entries = ck->dict_page->num_input_values; - int pos = 0, cur = 0; - for (int i = 0; i < num_entries; i++) { - int len = 0; - if (ck->physical_type == Type::FIXED_LEN_BYTE_ARRAY) { - if (cur + ck->type_length <= dict_size) { - len = ck->type_length; - pos = cur; - cur += len; - } else { - cur = dict_size; - } - } else { - if (cur + 4 <= dict_size) { - len = - dict[cur + 0] | (dict[cur + 1] << 8) | (dict[cur + 2] << 16) | (dict[cur + 3] << 24); - if (len >= 0 && cur + 4 + len <= dict_size) { - pos = cur + 4; - cur = pos + len; + cg::invoke_one(warp, [&] { + if (ck->num_dict_pages > 0 && ck->str_dict_index) { + // Data type to describe a string + string_index_pair* dict_index = ck->str_dict_index; + uint8_t const* dict = ck->dict_page->page_data; + int const dict_size = ck->dict_page->uncompressed_page_size; + int32_t const num_entries = ck->dict_page->num_input_values; + if (num_entries < 0 or dict_size < 0) { + set_error(static_cast(decode_error::INVALID_DICT_WIDTH), + error_code); + return; + } + int pos = 0, cur = 0; + for (int i = 0; i < num_entries; i++) { + int len = 0; + if (ck->physical_type == 
Type::FIXED_LEN_BYTE_ARRAY) { + if (cur + ck->type_length <= dict_size) { + len = ck->type_length; + pos = cur; + cur += len; } else { cur = dict_size; } + } else { + if (cur + 4 <= dict_size) { + len = + dict[cur + 0] | (dict[cur + 1] << 8) | (dict[cur + 2] << 16) | (dict[cur + 3] << 24); + if (len >= 0 && cur + 4 + len <= dict_size) { + pos = cur + 4; + cur = pos + len; + } else { + cur = dict_size; + } + } } + // TODO: Could store 8 entries in shared mem, then do a single warp-wide store + dict_index[i].first = reinterpret_cast(dict + pos); + dict_index[i].second = len; } - // TODO: Could store 8 entries in shared mem, then do a single warp-wide store - dict_index[i].first = reinterpret_cast(dict + pos); - dict_index[i].second = len; } - } + }); } } // namespace @@ -942,6 +953,7 @@ void decode_page_headers_with_pgidx(cudf::device_span chu void build_string_dictionary_index(ColumnChunkDesc* chunks, int32_t num_chunks, + kernel_error::pointer error_code, rmm::cuda_stream_view stream) { static_assert( @@ -954,8 +966,8 @@ void build_string_dictionary_index(ColumnChunkDesc* chunks, dim3 dim_block(build_string_dict_index_block_size, 1); dim3 dim_grid(num_blocks, 1); - build_string_dictionary_index_kernel<<>>(chunks, - num_chunks); + build_string_dictionary_index_kernel<<>>( + chunks, num_chunks, error_code); } } // namespace cudf::io::parquet::detail diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 680d11959a1..7d07f39aa38 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -735,10 +735,12 @@ void decode_page_headers_with_pgidx(cudf::device_span chu * * @param[in] chunks List of column chunks * @param[in] num_chunks Number of column chunks + * @param[out] error_code Pointer to the error code for kernel failures * @param[in] stream CUDA stream to use */ void build_string_dictionary_index(ColumnChunkDesc* chunks, int32_t num_chunks, + kernel_error::pointer error_code, rmm::cuda_stream_view 
stream); /** diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu index 07db8ff0c23..8ebb8879d7e 100644 --- a/cpp/src/io/parquet/reader_impl_preprocess.cu +++ b/cpp/src/io/parquet/reader_impl_preprocess.cu @@ -94,8 +94,14 @@ void reader_impl::build_string_dict_indices() set_str_dict_index_ptr{pass.str_dict_index.data(), str_dict_index_offsets, pass.chunks}); // compute the indices - build_string_dictionary_index(pass.chunks.device_ptr(), pass.chunks.size(), _stream); + kernel_error error_code(_stream); + build_string_dictionary_index( + pass.chunks.device_ptr(), pass.chunks.size(), error_code.data(), _stream); pass.chunks.device_to_host(_stream); + auto const error = error_code.value_sync(_stream); + CUDF_EXPECTS( + error == 0, + "Parquet dictionary index construction failed with code(s) " + kernel_error::to_string(error)); } void reader_impl::allocate_nesting_info() From be407805d297b1b84878d021305eef3fe867f3e5 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 6 May 2026 18:55:29 -0700 Subject: [PATCH 31/36] Make RapidsMPF the default runtime for cudf_polars streaming executor (#22281) closes https://github.com/rapidsai/cudf/issues/21466 closes https://github.com/rapidsai/cudf/issues/21767 Waiting for https://github.com/rapidsai/cudf/pull/22212 * Makes rapidsmpf a required dependency of cudf_polars * Removes the following `StreamingExecutor` options as they were "experimental" with associated code paths * `StreamingExecutor.runtime` * `StreamingExecutor.shuffle_method` * `StreamingExecutor.unique_fraction` * `StreamingExecutor.groupby_n_ary` * `StreamingExecutor.rapidsmpf_spill` * Removes the task runtime and associated tests * Some tests we modified to only test 1 specific test configuration because of https://github.com/rapidsai/cudf/issues/22346 to pass these tests for now. 
Planning on revisiting this once rapidsmpf becomes the default Ops-Bot-Merge-Barrier: true Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/22281 --- .devcontainer/Dockerfile | 2 + .devcontainer/README.md | 8 + .../cuda12.9-conda/devcontainer.json | 5 +- .devcontainer/cuda12.9-pip/devcontainer.json | 7 +- .../cuda13.1-conda/devcontainer.json | 5 +- .devcontainer/cuda13.1-pip/devcontainer.json | 7 +- .github/workflows/pr.yaml | 41 +- ci/test_cudf_polars_experimental.sh | 2 +- ci/test_cudf_polars_polars_tests.sh | 2 +- .../all_cuda-129_arch-aarch64.yaml | 1 + .../all_cuda-129_arch-x86_64.yaml | 1 + .../all_cuda-131_arch-aarch64.yaml | 1 + .../all_cuda-131_arch-x86_64.yaml | 1 + conda/recipes/cudf-polars/recipe.yaml | 1 + dependencies.yaml | 56 ++- docs/cudf/source/cudf_polars/api.md | 1 - .../cudf/source/cudf_polars/engine_options.md | 2 +- python/cudf_polars/cudf_polars/callback.py | 5 +- python/cudf_polars/cudf_polars/dsl/expr.py | 2 - .../cudf_polars/dsl/expressions/base.py | 8 +- python/cudf_polars/cudf_polars/dsl/ir.py | 10 +- .../cudf_polars/experimental/base.py | 13 +- .../benchmarks/utils_new_frontends.py | 27 +- .../cudf_polars/experimental/dispatch.py | 39 +- .../cudf_polars/experimental/distinct.py | 77 +--- .../cudf_polars/experimental/explain.py | 5 +- .../cudf_polars/experimental/expressions.py | 21 +- .../cudf_polars/experimental/groupby.py | 63 +-- .../cudf_polars/experimental/io.py | 232 +--------- .../cudf_polars/experimental/join.py | 164 +------ .../cudf_polars/experimental/parallel.py | 125 +----- .../experimental/rapidsmpf/core.py | 4 - .../experimental/rapidsmpf/frontend/core.py | 2 +- .../experimental/rapidsmpf/frontend/dask.py | 2 - .../rapidsmpf/frontend/options.py | 18 - 
.../experimental/rapidsmpf/frontend/ray.py | 6 - .../experimental/rapidsmpf/frontend/spmd.py | 21 +- .../cudf_polars/experimental/repartition.py | 43 +- .../cudf_polars/experimental/scheduler.py | 153 ------- .../cudf_polars/experimental/shuffle.py | 279 +----------- .../cudf_polars/experimental/sort.py | 402 +----------------- .../cudf_polars/experimental/utils.py | 47 +- .../cudf_polars/testing/asserts.py | 5 +- .../cudf_polars/testing/inject_gpu_engine.py | 3 +- .../cudf_polars/cudf_polars/utils/config.py | 214 ++-------- .../cudf_polars/utils/cuda_stream.py | 5 - python/cudf_polars/pyproject.toml | 2 +- python/cudf_polars/tests/conftest.py | 30 +- .../tests/experimental/test_dask.py | 2 - .../tests/experimental/test_explain.py | 4 +- .../tests/experimental/test_groupby.py | 5 +- .../tests/experimental/test_hstack.py | 2 - .../tests/experimental/test_options.py | 5 - .../tests/experimental/test_parallel.py | 41 -- .../tests/experimental/test_ray.py | 4 +- .../tests/experimental/test_sort.py | 4 - .../tests/experimental/test_spmd.py | 3 +- .../tests/experimental/test_unique.py | 35 +- python/cudf_polars/tests/test_config.py | 175 ++------ python/cudf_polars/tests/test_scan.py | 2 +- python/cudf_polars/tests/test_sink.py | 1 + python/cudf_polars/tests/test_tracing.py | 14 +- .../tests/testing/test_engine_utils.py | 5 - 63 files changed, 275 insertions(+), 2197 deletions(-) delete mode 100644 python/cudf_polars/cudf_polars/experimental/scheduler.py diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index b4b2ecb69e0..57ccf6302c5 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -5,9 +5,11 @@ ARG PYTHON_PACKAGE_MANAGER=conda FROM ${BASE} as pip-base +# libnuma-dev is required for pip devcontainers for cucascade from rapidsmpf RUN apt update -y \ && DEBIAN_FRONTEND=noninteractive apt install -y \ librdkafka-dev \ + libnuma-dev \ && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*; ENV DEFAULT_VIRTUAL_ENV=rapids diff 
--git a/.devcontainer/README.md b/.devcontainer/README.md index 91ee7ef85f7..cc5fac22fde 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -20,6 +20,7 @@ This container is a turnkey development environment for building and testing the By default, the following directories are bind-mounted into the devcontainer: * `${repo}:/home/coder/cudf` +* `${repo}/../rapidsmpf:/home/coder/rapidsmpf` * `${repo}/../.aws:/home/coder/.aws` * `${repo}/../.local:/home/coder/.local` * `${repo}/../.cache:/home/coder/.cache` @@ -28,6 +29,13 @@ By default, the following directories are bind-mounted into the devcontainer: This ensures caches, configurations, dependencies, and your commits are persisted on the host across container runs. +The [rapidsmpf](https://github.com/rapidsai/rapidsmpf) repository is a required dependency of `cudf_polars` (that also requires `libcudf`) and must be cloned as a sibling directory to the cudf repo before launching the devcontainer: + +``` +# from the parent directory of your cudf clone +git clone https://github.com/rapidsai/rapidsmpf.git +``` + ## Launch a Dev Container To launch a devcontainer from VSCode, open the cuDF repo and select the "Reopen in Container" button in the bottom right:
diff --git a/.devcontainer/cuda12.9-conda/devcontainer.json b/.devcontainer/cuda12.9-conda/devcontainer.json index 9d672bdbde8..272007e7c95 100644 --- a/.devcontainer/cuda12.9-conda/devcontainer.json +++ b/.devcontainer/cuda12.9-conda/devcontainer.json @@ -49,7 +49,7 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.9-envs}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.9-envs} ${localWorkspaceFolder}/../rapidsmpf" ], "postAttachCommand": [ "/bin/bash", @@ -63,7 +63,8 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.9-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.9-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../rapidsmpf,target=/home/coder/rapidsmpf,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda12.9-pip/devcontainer.json b/.devcontainer/cuda12.9-pip/devcontainer.json index f99cc4ce5dc..5012dcfa979 100644 --- a/.devcontainer/cuda12.9-pip/devcontainer.json +++ b/.devcontainer/cuda12.9-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "12.9", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:26.06-cpp-cuda12.9" + "BASE": "rapidsai/devcontainers:26.06-cpp-cuda12.9-ucx1.19.0-openmpi5.0.10" } }, "runArgs": [ @@ -27,7 +27,7 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p 
${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs} ${localWorkspaceFolder}/../rapidsmpf" ], "postAttachCommand": [ "/bin/bash", @@ -40,7 +40,8 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.9-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../rapidsmpf,target=/home/coder/rapidsmpf,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda13.1-conda/devcontainer.json b/.devcontainer/cuda13.1-conda/devcontainer.json index a73953b1989..785302c3c1d 100644 --- a/.devcontainer/cuda13.1-conda/devcontainer.json +++ b/.devcontainer/cuda13.1-conda/devcontainer.json @@ -49,7 +49,7 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.1-envs}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda13.1-envs} ${localWorkspaceFolder}/../rapidsmpf" ], "postAttachCommand": [ "/bin/bash", @@ -63,7 +63,8 @@ "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", 
"source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent", - "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.1-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda13.1-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../rapidsmpf,target=/home/coder/rapidsmpf,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.devcontainer/cuda13.1-pip/devcontainer.json b/.devcontainer/cuda13.1-pip/devcontainer.json index 8596ff6b503..730b1c1e8ca 100644 --- a/.devcontainer/cuda13.1-pip/devcontainer.json +++ b/.devcontainer/cuda13.1-pip/devcontainer.json @@ -5,7 +5,7 @@ "args": { "CUDA": "13.1", "PYTHON_PACKAGE_MANAGER": "pip", - "BASE": "rapidsai/devcontainers:26.06-cpp-cuda13.1" + "BASE": "rapidsai/devcontainers:26.06-cpp-cuda13.1-ucx1.19.0-openmpi5.0.10" } }, "runArgs": [ @@ -27,7 +27,7 @@ "initializeCommand": [ "/bin/bash", "-c", - "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.1-venvs}" + "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda13.1-venvs} ${localWorkspaceFolder}/../rapidsmpf" ], "postAttachCommand": [ "/bin/bash", @@ -40,7 +40,8 @@ "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent", "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent", - 
"source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.1-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent" + "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda13.1-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent", + "source=${localWorkspaceFolder}/../rapidsmpf,target=/home/coder/rapidsmpf,type=bind,consistency=consistent" ], "customizations": { "vscode": { diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index c20f7f7ea79..a7ef36049b0 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -537,23 +537,30 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: pull-request script: ci/test_wheel_dask_cudf.sh - # devcontainer: - # secrets: inherit - # needs: telemetry-setup - # uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main - # with: - # arch: '["amd64", "arm64"]' - # cuda: '["13.1"]' - # node_type: "cpu8" - # timeout-minutes: 90 - # env: | - # SCCACHE_DIST_MAX_RETRIES=inf - # SCCACHE_SERVER_LOG=sccache=debug - # SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false - # build_command: | - # sccache --zero-stats; - # build-all -j0 -DBUILD_BENCHMARKS=ON --verbose 2>&1 | tee telemetry-artifacts/build.log; - # sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; + devcontainer: + secrets: inherit + needs: telemetry-setup + uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@main + with: + arch: '["amd64", "arm64"]' + cuda: '["13.1"]' + node_type: "cpu8" + timeout-minutes: 90 + env: | + SCCACHE_DIST_MAX_RETRIES=inf + SCCACHE_SERVER_LOG=sccache=debug + SCCACHE_DIST_FALLBACK_TO_LOCAL_COMPILE=false + # clone-rapidsmpf to not use the rapidsmpf wheels from cudf_polars dependency + # librapidsmpf-cu13 
wheels brings in a hardcoded libnuma-dev cmake target: https://github.com/NVIDIA/cuCascade/issues/118 + # -DBUILD_TESTS=OFF to match rapidsmpf https://github.com/rapidsai/rapidsmpf/blob/main/.github/workflows/pr.yaml#L351 (leads to compilation errors) + # -DCUDF_BUILD_TESTUTIL=OFF to avoid IMPORTED_GLOBAL promotion errors when cuCascade's find_package(cudf) loads cudf-config.cmake from a CPM subdirectory + build_command: | + sccache --zero-stats; + clone-rapidsmpf -j$(nproc) -v -q --branch "$(cat ~/cudf/RAPIDS_BRANCH)" --clone-upstream --depth 1 --single-branch --shallow-submodules; + if [ "$PYTHON_PACKAGE_MANAGER" = "pip" ]; then rapids-make-pip-env --force; elif [ "$PYTHON_PACKAGE_MANAGER" = "conda" ]; then rapids-make-conda-env --force; fi; + rapids-generate-scripts; + build-all -j0 -DBUILD_BENCHMARKS=OFF -DBUILD_NUMA_SUPPORT=OFF -DBUILD_TESTS=OFF -DCUDF_BUILD_TESTUTIL=OFF --verbose 2>&1 | tee telemetry-artifacts/build.log; + sccache --show-adv-stats | tee telemetry-artifacts/sccache-stats.txt; unit-tests-cudf-pandas: needs: [wheel-build-cudf, changed-files] secrets: inherit diff --git a/ci/test_cudf_polars_experimental.sh b/ci/test_cudf_polars_experimental.sh index 02eab86c0dd..aa3abd66254 100755 --- a/ci/test_cudf_polars_experimental.sh +++ b/ci/test_cudf_polars_experimental.sh @@ -28,7 +28,7 @@ rapids-pip-retry install \ -v \ --prefer-binary \ --constraint "${PIP_CONSTRAINT}" \ - "$(echo "${CUDF_POLARS_WHEELHOUSE}"/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,experimental,rapidsmpf]" \ + "$(echo "${CUDF_POLARS_WHEELHOUSE}"/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,experimental]" \ "$(echo "${LIBCUDF_WHEELHOUSE}"/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" diff --git a/ci/test_cudf_polars_polars_tests.sh b/ci/test_cudf_polars_polars_tests.sh index 802110b18ac..4e19464a895 100755 --- a/ci/test_cudf_polars_polars_tests.sh +++ b/ci/test_cudf_polars_polars_tests.sh @@ -27,7 
+27,7 @@ rapids-logger "Install libcudf, pylibcudf and cudf_polars" rapids-pip-retry install \ -v \ --constraint "${PIP_CONSTRAINT}" \ - "$(echo "${CUDF_POLARS_WHEELHOUSE}"/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,rapidsmpf]" \ + "$(echo "${CUDF_POLARS_WHEELHOUSE}"/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" \ "$(echo "${LIBCUDF_WHEELHOUSE}"/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index d1e0e59f79b..e0b3aa994f4 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -85,6 +85,7 @@ dependencies: - rapids-build-backend>=0.4.0,<0.5.0 - rapids-dask-dependency==26.6.*,>=0.0.0a0 - rapids-logger==0.2.*,>=0.0.0a0 +- rapidsmpf==26.6.*,>=0.0.0a0 - rich - rmm==26.6.*,>=0.0.0a0 - s3fs>=2022.3.0 diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index c080f732b55..3255ce84837 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -85,6 +85,7 @@ dependencies: - rapids-build-backend>=0.4.0,<0.5.0 - rapids-dask-dependency==26.6.*,>=0.0.0a0 - rapids-logger==0.2.*,>=0.0.0a0 +- rapidsmpf==26.6.*,>=0.0.0a0 - rich - rmm==26.6.*,>=0.0.0a0 - s3fs>=2022.3.0 diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 3194a087894..44314a0544a 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -85,6 +85,7 @@ dependencies: - rapids-build-backend>=0.4.0,<0.5.0 - rapids-dask-dependency==26.6.*,>=0.0.0a0 - rapids-logger==0.2.*,>=0.0.0a0 +- rapidsmpf==26.6.*,>=0.0.0a0 - rich - rmm==26.6.*,>=0.0.0a0 - s3fs>=2022.3.0 diff --git 
a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index f81ec9b08d0..949a3602e4f 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -85,6 +85,7 @@ dependencies: - rapids-build-backend>=0.4.0,<0.5.0 - rapids-dask-dependency==26.6.*,>=0.0.0a0 - rapids-logger==0.2.*,>=0.0.0a0 +- rapidsmpf==26.6.*,>=0.0.0a0 - rich - rmm==26.6.*,>=0.0.0a0 - s3fs>=2022.3.0 diff --git a/conda/recipes/cudf-polars/recipe.yaml b/conda/recipes/cudf-polars/recipe.yaml index e3a21aa1afd..52ac74c7c8b 100644 --- a/conda/recipes/cudf-polars/recipe.yaml +++ b/conda/recipes/cudf-polars/recipe.yaml @@ -36,6 +36,7 @@ requirements: - nvidia-ml-py>=12 - python - pylibcudf =${{ version }} + - rapidsmpf =${{ minor_version }} - polars>=1.30,<1.39 - packaging - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} diff --git a/dependencies.yaml b/dependencies.yaml index cae4816eec5..af7dfea460f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -8,6 +8,51 @@ files: matrix: cuda: ["12.9", "13.1"] arch: [x86_64, aarch64] + includes: + - build_base + - build_all + - build_cpp + - build_python_common + - clang + - cuda + - cuda_version + - cudf_polars_trace + - depends_on_cupy + - depends_on_cuda_python + - depends_on_dask_cuda + - depends_on_libkvikio + - depends_on_librmm + - depends_on_libnvcomp + - depends_on_numba_cuda + - depends_on_rapids_logger + - depends_on_rapidsmpf + - depends_on_rmm + - develop + - docs + - iwyu + - notebooks + - numpy_run + - py_version + - pyarrow_run + - rapids_build_skbuild + - rapids_build_setuptools + - run_common + - run_cudf + - run_cudf_polars + - run_pylibcudf + - run_dask_cudf + - run_custreamz + - test_cpp + - test_python_common + - test_python_cudf + - test_python_cudf_common + - test_python_pylibcudf + - test_python_cudf_pandas + - test_python_cudf_polars + - test_python_s3 + devcontainers: + output: none + # Same as "all", 
excluding depends_on_rapidsmpf (which is built from source) includes: - build_base - build_all @@ -328,6 +373,7 @@ files: table: project includes: - run_cudf_polars + - depends_on_rapidsmpf - depends_on_pylibcudf - depends_on_cuda_python py_run_cudf_polars_experimental: @@ -338,15 +384,6 @@ files: key: experimental includes: - run_cudf_polars_experimental - py_run_cudf_polars_rapidsmpf: - output: pyproject - pyproject_dir: python/cudf_polars - extras: - table: project.optional-dependencies - key: rapidsmpf - includes: - - depends_on_rapidsmpf - - pyarrow_run py_test_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars @@ -360,7 +397,6 @@ files: - test_python_common - test_python_cudf_polars - cudf_polars_trace - - depends_on_rapidsmpf py_trace_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars diff --git a/docs/cudf/source/cudf_polars/api.md b/docs/cudf/source/cudf_polars/api.md index 741b2f6f758..823954a3b08 100644 --- a/docs/cudf/source/cudf_polars/api.md +++ b/docs/cudf/source/cudf_polars/api.md @@ -13,7 +13,6 @@ For the most part, the public API of `cudf-polars` is the polars API. ExecutorType, InMemoryExecutor, ParquetOptions, - ShuffleMethod, StreamingExecutor, StreamingFallbackMode, ``` diff --git a/docs/cudf/source/cudf_polars/engine_options.md b/docs/cudf/source/cudf_polars/engine_options.md index 67e601467d9..ba6085275b8 100644 --- a/docs/cudf/source/cudf_polars/engine_options.md +++ b/docs/cudf/source/cudf_polars/engine_options.md @@ -52,7 +52,7 @@ For example, the environment variable `max_rows_per_partition` to use if it isn't overridden through `executor_options`. -For boolean options, like `rapidsmpf_spill`, the values `{"1", "true", "yes", "y"}` +For boolean options, like `sink_to_directory`, the values `{"1", "true", "yes", "y"}` are considered `True` and `{"0", "false", "no", "n"}` are considered `False`. 
See [Configuration Reference](#cudf-polars-api) for a full list of options, and diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py index fb915784f96..acd0452ae1b 100644 --- a/python/cudf_polars/cudf_polars/callback.py +++ b/python/cudf_polars/cudf_polars/callback.py @@ -159,13 +159,12 @@ def set_memory_resource( """ previous = rmm.mr.get_current_device_resource() if mr is None: - # Use cuda async by default with the rapidsmpf runtime. + # Use cuda async by default with the streaming executor. if ( memory_resource_config is None and executor.name == "streaming" - and executor.runtime == "rapidsmpf" and (device_size := get_total_device_memory()) is not None - ): # pragma: no cover; Requires rapidsmpf runtime. + ): # pragma: no cover memory_resource_config = MemoryResourceConfig( qualname="rmm.mr.CudaAsyncMemoryResource", options={ diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py index 5dd8898bde2..b21485ac41e 100644 --- a/python/cudf_polars/cudf_polars/dsl/expr.py +++ b/python/cudf_polars/cudf_polars/dsl/expr.py @@ -15,7 +15,6 @@ from cudf_polars.dsl.expressions.aggregation import Agg from cudf_polars.dsl.expressions.base import ( - AggInfo, Col, ColRef, ErrorExpr, @@ -37,7 +36,6 @@ __all__ = [ "Agg", - "AggInfo", "BinOp", "BooleanFunction", "Cast", diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/base.py b/python/cudf_polars/cudf_polars/dsl/expressions/base.py index 3336ea3fd7c..b97be71b771 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/base.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/base.py @@ -8,7 +8,7 @@ import enum from enum import IntEnum -from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple +from typing import TYPE_CHECKING, Any, ClassVar import pylibcudf as plc @@ -20,11 +20,7 @@ from cudf_polars.containers import Column, DataFrame, DataType -__all__ = ["AggInfo", "Col", "ColRef", "ExecutionContext", "Expr", 
"NamedExpr"] - - -class AggInfo(NamedTuple): - requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]] +__all__ = ["Col", "ColRef", "ExecutionContext", "Expr", "NamedExpr"] class ExecutionContext(IntEnum): diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index bee30183e1c..1c48f70bb11 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -819,11 +819,13 @@ def read_csv_header( # TODO: Nested column names names = chunk.column_names(include_children=False) concatenated_columns = chunk.tbl.columns() - while reader.has_next(): + while reader.has_next(): # pragma: no cover columns = reader.read_chunk().tbl.columns() # Discard columns while concatenating to reduce memory footprint. # Reverse order to avoid O(n^2) list popping cost. - for i in range(len(concatenated_columns) - 1, -1, -1): + for i in range( # pragma: no cover + len(concatenated_columns) - 1, -1, -1 + ): concatenated_columns[i] = plc.concatenate.concatenate( [concatenated_columns[i], columns.pop()], stream=stream ) @@ -840,7 +842,7 @@ def read_csv_header( num_rows=num_rows, ) if include_file_paths is not None: - df = Scan.add_file_paths( + df = Scan.add_file_paths( # pragma: no cover include_file_paths, paths, chunk.num_rows_per_source, df ) else: @@ -1164,7 +1166,7 @@ def _write_parquet( | plc.io.parquet.ParquetWriterOptionsBuilder ) - if ( + if ( # pragma: no cover parquet_options.chunked and parquet_options.n_output_chunks != 1 and df.table.num_rows() != 0 diff --git a/python/cudf_polars/cudf_polars/experimental/base.py b/python/cudf_polars/cudf_polars/experimental/base.py index 73ed9b3dbe1..80ff0dfacbd 100644 --- a/python/cudf_polars/cudf_polars/experimental/base.py +++ b/python/cudf_polars/cudf_polars/experimental/base.py @@ -11,11 +11,10 @@ from cudf_polars.dsl.traversal import traversal if TYPE_CHECKING: - from collections.abc import Generator, Iterator + from collections.abc import 
Generator from cudf_polars.dsl.expr import NamedExpr from cudf_polars.dsl.ir import IR - from cudf_polars.dsl.nodebase import Node class PartitionInfo: @@ -40,22 +39,12 @@ def __init__( self.partitioned_on = partitioned_on self.io_plan = io_plan - def keys(self, node: Node) -> Iterator[tuple[str, int]]: - """Return the partitioned keys for a given node.""" - name = get_key_name(node) - yield from ((name, i) for i in range(self.count)) - def __rich_repr__(self) -> Generator[Any, None, None]: """Formatting for rich.pretty.pprint.""" yield "count", self.count yield "partitioned_on", self.partitioned_on -def get_key_name(node: Node) -> str: - """Generate the key name for a Node.""" - return f"{type(node).__name__.lower()}-{hash(node)}" - - class SerializedDataSourceInfo(TypedDict): """The serialized form of DataSourceInfo.""" diff --git a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils_new_frontends.py b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils_new_frontends.py index d514d4c44e9..74386993737 100644 --- a/python/cudf_polars/cudf_polars/experimental/benchmarks/utils_new_frontends.py +++ b/python/cudf_polars/cudf_polars/experimental/benchmarks/utils_new_frontends.py @@ -636,26 +636,8 @@ def get_executor_options( executor_options: dict[str, Any] = ( run_config.streaming_options.to_executor_options() ) - executor_options["runtime"] = "rapidsmpf" executor_options["max_io_threads"] = run_config.max_io_threads - # PDSHQueries: inject unique_fraction when dynamic planning is explicitly disabled - if ( - benchmark - and benchmark.__name__ == "PDSHQueries" - and run_config.executor == "streaming" - and run_config.streaming_options.dynamic_planning is None - ): - executor_options.setdefault( - "unique_fraction", - { - "c_custkey": 0.05, - "l_orderkey": 1.0, - "l_partkey": 0.1, - "o_custkey": 0.25, - }, - ) - return executor_options @@ -1110,8 +1092,7 @@ def run_polars_spmd( from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine 
executor_options = get_executor_options(run_config, benchmark=benchmark) - # "runtime" and "cluster" are reserved — SPMDEngine sets them - executor_options.pop("runtime", None) + # "cluster" is reserved — SPMDEngine sets it executor_options.pop("cluster", None) engine_options = { **run_config.streaming_options.to_engine_options(), @@ -1168,8 +1149,7 @@ def run_polars_ray( from cudf_polars.experimental.rapidsmpf.frontend.ray import RayEngine executor_options = get_executor_options(run_config, benchmark=benchmark) - # "runtime", "cluster" are reserved — RayEngine sets them - executor_options.pop("runtime", None) + # "cluster" is reserved — RayEngine sets it executor_options.pop("cluster", None) engine_options: dict[str, Any] = { **run_config.streaming_options.to_engine_options(), @@ -1218,8 +1198,7 @@ def run_polars_dask( from cudf_polars.experimental.rapidsmpf.frontend.dask import DaskEngine executor_options = get_executor_options(run_config, benchmark=benchmark) - # "runtime", "cluster" are reserved — DaskEngine sets them - executor_options.pop("runtime", None) + # "cluster" is reserved — DaskEngine sets it executor_options.pop("cluster", None) engine_options: dict[str, Any] = { **run_config.streaming_options.to_engine_options(), diff --git a/python/cudf_polars/cudf_polars/experimental/dispatch.py b/python/cudf_polars/cudf_polars/experimental/dispatch.py index 3ac67b6af46..9ff0cc3156b 100644 --- a/python/cudf_polars/cudf_polars/experimental/dispatch.py +++ b/python/cudf_polars/cudf_polars/experimental/dispatch.py @@ -5,7 +5,7 @@ from __future__ import annotations from functools import singledispatch -from typing import TYPE_CHECKING, Any, TypeAlias, TypedDict +from typing import TYPE_CHECKING, TypeAlias, TypedDict from cudf_polars.typing import GenericTransformer @@ -13,7 +13,7 @@ from collections.abc import MutableMapping from cudf_polars.dsl import ir - from cudf_polars.dsl.ir import IR, IRExecutionContext + from cudf_polars.dsl.ir import IR from 
cudf_polars.experimental.base import ( PartitionInfo, StatsCollector, @@ -72,38 +72,3 @@ def lower_ir_node( lower_ir_graph """ raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover - - -@singledispatch -def generate_ir_tasks( - ir: IR, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - """ - Generate a task graph for evaluation of an IR node. - - Parameters - ---------- - ir - IR node to generate tasks for. - partition_info - Partitioning information, obtained from :func:`lower_ir_graph`. - context - Runtime context for IR node execution. - - Returns - ------- - mapping - A (partial) dask task graph for the evaluation of an ir node. - - Notes - ----- - Task generation should only produce the tasks for the current node, - referring to child tasks by name. - - See Also - -------- - task_graph - """ - raise AssertionError(f"Unhandled type {type(ir)}") # pragma: no cover diff --git a/python/cudf_polars/cudf_polars/experimental/distinct.py b/python/cudf_polars/cudf_polars/experimental/distinct.py index 9ae148f77d3..564fe570919 100644 --- a/python/cudf_polars/cudf_polars/experimental/distinct.py +++ b/python/cudf_polars/cudf_polars/experimental/distinct.py @@ -17,8 +17,6 @@ from cudf_polars.experimental.shuffle import Shuffle from cudf_polars.experimental.utils import ( _dynamic_planning_on, - _fallback_inform, - _get_unique_fractions, _lower_ir_fallback, ) @@ -35,8 +33,6 @@ def lower_distinct( child: IR, partition_info: MutableMapping[IR, PartitionInfo], config_options: ConfigOptions[StreamingExecutor], - *, - unique_fraction: float | None = None, ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: """ Lower a Distinct IR into partition-wise stages. @@ -56,9 +52,6 @@ def lower_distinct( associated partitioning information. config_options GPUEngine configuration options. - unique_fraction - Fraction of unique values to total values. Used for algorithm selection. 
- A value of `1.0` means the column is unique. Returns ------- @@ -68,69 +61,24 @@ def lower_distinct( A mapping from unique nodes in the new graph to associated partitioning information. """ - subset: frozenset[str] = ir.subset or frozenset(ir.schema) - distinct_keys = tuple( - NamedExpr(name, Col(ir.schema[name], name)) - for name in ir.schema - if name in subset - ) - child_count = partition_info[child].count - shuffled = partition_info[child].partitioned_on == distinct_keys - # Check for ordering requirements (shuffle is not stable) - require_tree_reduction = ir.stable or ir.keep in ( - plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST, - plc.stream_compaction.DuplicateKeepOption.KEEP_LAST, - ) - - output_count = 1 - n_ary = 32 # Arbitrary default (for now) + n_ary = 32 if ir.zlice is not None and ir.zlice[1] is not None: - # Head/tail slice operation has been pushed into Distinct - # (caller ensures only simple slices reach here) n_ary = max(1_000_000 // ir.zlice[1], 2) - elif unique_fraction is not None: - # Use unique_fraction to determine partitioning - n_ary = min(max(int(1.0 / unique_fraction), 2), child_count) - output_count = max(int(unique_fraction * child_count), 1) - - if output_count > 1 and require_tree_reduction: - # Need to reduce down to a single partition even - # if the unique_fraction is large. 
- output_count = 1 - _fallback_inform( - "Unsupported unique options for multiple partitions.", - config_options, - ) # Partition-wise unique count = child_count new_node: IR = ir.reconstruct([child]) partition_info[new_node] = PartitionInfo(count=count) - if shuffled or output_count == 1: - # Tree reduction - while count > output_count: - new_node = Repartition(new_node.schema, new_node) - count = max(math.ceil(count / n_ary), output_count) - partition_info[new_node] = PartitionInfo(count=count) - new_node = ir.reconstruct([new_node]) - partition_info[new_node] = PartitionInfo(count=count) - else: - # Shuffle - new_node = Shuffle( - new_node.schema, - distinct_keys, - config_options.executor.shuffle_method, - new_node, - ) - partition_info[new_node] = PartitionInfo(count=output_count) + # Tree reduction + while count > 1: + new_node = Repartition(new_node.schema, new_node) + count = max(math.ceil(count / n_ary), 1) + partition_info[new_node] = PartitionInfo(count=count) new_node = ir.reconstruct([new_node]) - partition_info[new_node] = PartitionInfo( - count=output_count, - partitioned_on=distinct_keys, - ) + partition_info[new_node] = PartitionInfo(count=count) return new_node, partition_info @@ -172,7 +120,6 @@ def _( child = Shuffle( child.schema, distinct_keys, - config_options.executor.shuffle_method, child, ) partition_info[child] = PartitionInfo( @@ -202,19 +149,9 @@ def _( ) return dynamic_node, partition_info - # Non-dynamic planning: use unique_fraction heuristics - unique_fraction_dict = _get_unique_fractions( - tuple(subset), - config_options.executor.unique_fraction, - ) - unique_fraction = ( - max(unique_fraction_dict.values()) if unique_fraction_dict else None - ) - return lower_distinct( ir, child, partition_info, config_options, - unique_fraction=unique_fraction, ) diff --git a/python/cudf_polars/cudf_polars/experimental/explain.py b/python/cudf_polars/cudf_polars/experimental/explain.py index 82f023b229c..d50d9fae0ae 100644 --- 
a/python/cudf_polars/cudf_polars/experimental/explain.py +++ b/python/cudf_polars/cudf_polars/experimental/explain.py @@ -288,10 +288,7 @@ def _(ir: GroupBy) -> dict[str, Serializable]: @_serialize_properties.register def _(ir: Shuffle) -> dict[str, Serializable]: - return { - "keys": [ne.name for ne in ir.keys], - "shuffle_method": ir.shuffle_method.value, - } + return {"keys": [ne.name for ne in ir.keys]} @_serialize_properties.register diff --git a/python/cudf_polars/cudf_polars/experimental/expressions.py b/python/cudf_polars/cudf_polars/experimental/expressions.py index d2a0070d009..d6df4cae8f9 100644 --- a/python/cudf_polars/cudf_polars/experimental/expressions.py +++ b/python/cudf_polars/cudf_polars/experimental/expressions.py @@ -41,22 +41,18 @@ from cudf_polars.containers import DataType from cudf_polars.dsl.expressions.aggregation import Agg -from cudf_polars.dsl.expressions.base import Col, ExecutionContext, Expr, NamedExpr +from cudf_polars.dsl.expressions.base import Col, ExecutionContext, NamedExpr from cudf_polars.dsl.expressions.binaryop import BinOp from cudf_polars.dsl.expressions.literal import Literal from cudf_polars.dsl.expressions.ternary import Ternary from cudf_polars.dsl.expressions.unary import Cast, Len, UnaryFunction -from cudf_polars.dsl.ir import IR, Distinct, Empty, HConcat, Select +from cudf_polars.dsl.ir import Distinct, Empty, HConcat, Select from cudf_polars.dsl.traversal import ( CachingVisitor, ) from cudf_polars.experimental.base import PartitionInfo from cudf_polars.experimental.repartition import Repartition -from cudf_polars.experimental.utils import ( - _dynamic_planning_on, - _get_unique_fractions, - _leaf_column_names, -) +from cudf_polars.experimental.utils import _dynamic_planning_on if TYPE_CHECKING: from collections.abc import Generator, MutableMapping, Sequence @@ -197,15 +193,6 @@ def _decompose_unique( ) (column,) = columns - unique_fraction_dict = _get_unique_fractions( - _leaf_column_names(child), - 
config_options.executor.unique_fraction, - ) - - unique_fraction = ( - max(unique_fraction_dict.values()) if unique_fraction_dict else None - ) - input_ir, partition_info = lower_distinct( Distinct( {column.name: column.dtype}, @@ -218,7 +205,6 @@ def _decompose_unique( input_ir, partition_info, config_options, - unique_fraction=unique_fraction, ) return column, input_ir, partition_info @@ -344,7 +330,6 @@ def _decompose_agg_node( input_ir = Shuffle( input_ir.schema, shuffle_on, - config_options.executor.shuffle_method, input_ir, ) partition_info[input_ir] = PartitionInfo( diff --git a/python/cudf_polars/cudf_polars/experimental/groupby.py b/python/cudf_polars/cudf_polars/experimental/groupby.py index 898dfdbf03f..6a17b56bfc5 100644 --- a/python/cudf_polars/cudf_polars/experimental/groupby.py +++ b/python/cudf_polars/cudf_polars/experimental/groupby.py @@ -36,7 +36,6 @@ from cudf_polars.experimental.shuffle import Shuffle from cudf_polars.experimental.utils import ( _dynamic_planning_on, - _get_unique_fractions, _lower_ir_fallback, ) @@ -390,7 +389,6 @@ def _( # Check if we are dealing with any high-cardinality columns post_aggregation_count = 1 # Default tree reduction - groupby_key_columns = [ne.name for ne in ir.keys] shuffled = partition_info[child].partitioned_on == ir.keys child_count = partition_info[child].count @@ -421,7 +419,6 @@ def _( child = Shuffle( child.schema, ir.keys, - config_options.executor.shuffle_method, child, ) partition_info[child] = PartitionInfo( @@ -441,14 +438,6 @@ def _( ) return dynamic_node, partition_info - if unique_fraction_dict := _get_unique_fractions( - groupby_key_columns, - config_options.executor.unique_fraction, - ): - # Use unique_fraction to determine output partitioning - unique_fraction = max(unique_fraction_dict.values()) - post_aggregation_count = max(int(unique_fraction * child_count), 1) - # Partition-wise groupby operation pwise_schema = {k.name: k.value.dtype for k in ir.keys} | { k.name: k.value.dtype for k in 
piecewise_exprs @@ -465,46 +454,28 @@ def _( partition_info[gb_pwise] = PartitionInfo(count=child_count) grouped_keys = tuple(NamedExpr(k.name, Col(k.value.dtype, k.name)) for k in ir.keys) - # Reduction - gb_inter: GroupBy | Repartition | Shuffle + # N-ary tree reduction + gb_inter: GroupBy | Repartition reduction_schema = {k.name: k.value.dtype for k in grouped_keys} | { k.name: k.value.dtype for k in reduction_exprs } - if not shuffled and post_aggregation_count > 1: - # Shuffle reduction - if ir.maintain_order: # pragma: no cover - return _lower_ir_fallback( - ir, - rec, - msg="maintain_order not supported for multiple output partitions.", + n_ary = 32 + count = child_count + gb_inter = gb_pwise + while count > post_aggregation_count: + gb_inter = Repartition(gb_inter.schema, gb_inter) + count = max(math.ceil(count / n_ary), post_aggregation_count) + partition_info[gb_inter] = PartitionInfo(count=count) + if count > post_aggregation_count: + gb_inter = GroupBy( + reduction_schema, + grouped_keys, + reduction_exprs, + ir.maintain_order, + None, + gb_inter, ) - - gb_inter = Shuffle( - gb_pwise.schema, - grouped_keys, - config_options.executor.shuffle_method, - gb_pwise, - ) - partition_info[gb_inter] = PartitionInfo(count=post_aggregation_count) - else: - # N-ary tree reduction - n_ary = config_options.executor.groupby_n_ary - count = child_count - gb_inter = gb_pwise - while count > post_aggregation_count: - gb_inter = Repartition(gb_inter.schema, gb_inter) - count = max(math.ceil(count / n_ary), post_aggregation_count) partition_info[gb_inter] = PartitionInfo(count=count) - if count > post_aggregation_count: - gb_inter = GroupBy( - reduction_schema, - grouped_keys, - reduction_exprs, - ir.maintain_order, - None, - gb_inter, - ) - partition_info[gb_inter] = PartitionInfo(count=count) # Final aggregation gb_reduce = GroupBy( diff --git a/python/cudf_polars/cudf_polars/experimental/io.py b/python/cudf_polars/cudf_polars/experimental/io.py index 
f45baa054dd..2cea0274ee6 100644 --- a/python/cudf_polars/cudf_polars/experimental/io.py +++ b/python/cudf_polars/cudf_polars/experimental/io.py @@ -4,13 +4,11 @@ from __future__ import annotations -import dataclasses import functools import itertools import math import statistics from collections import defaultdict -from functools import partial from pathlib import Path from typing import TYPE_CHECKING, Any, Literal, overload @@ -24,16 +22,14 @@ Empty, Scan, Sink, - Union, ) from cudf_polars.experimental.base import ( IOPartitionFlavor, IOPartitionPlan, PartitionInfo, SerializedDataSourceInfo, - get_key_name, ) -from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node +from cudf_polars.experimental.dispatch import lower_ir_node from cudf_polars.utils.config import Cluster from cudf_polars.utils.cuda_stream import get_cuda_stream from cudf_polars.utils.versions import POLARS_VERSION_LT_137 @@ -62,36 +58,9 @@ def _( ir: DataFrameScan, rec: LowerIRTransformer ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: - config_options = rec.state["config_options"] + from cudf_polars.experimental.rapidsmpf.io import lower_dataframescan_rapidsmpf - # RapidsMPF runtime: Use rapidsmpf-specific lowering - if ( - config_options.executor.runtime == "rapidsmpf" - ): # pragma: no cover; Requires rapidsmpf runtime - from cudf_polars.experimental.rapidsmpf.io import lower_dataframescan_rapidsmpf - - return lower_dataframescan_rapidsmpf(ir, rec) - - rows_per_partition = config_options.executor.max_rows_per_partition - nrows = max(ir.df.shape()[0], 1) - count = math.ceil(nrows / rows_per_partition) - - if count > 1: - length = math.ceil(nrows / count) - slices = [ - DataFrameScan( - ir.schema, - ir.df.slice(offset, length), - ir.projection, - ) - for offset in range(0, nrows, length) - ] - new_node = Union(ir.schema, None, *slices) - return new_node, {slice: PartitionInfo(count=1) for slice in slices} | { - new_node: PartitionInfo(count=count) - } - - return ir, {ir: 
PartitionInfo(count=1)} + return lower_dataframescan_rapidsmpf(ir, rec) def scan_partition_plan( @@ -285,84 +254,9 @@ def _( def _( ir: Scan, rec: LowerIRTransformer ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: - partition_info: MutableMapping[IR, PartitionInfo] - config_options = rec.state["config_options"] - - # RapidsMPF runtime: Use rapidsmpf-specific lowering - if ( - config_options.executor.name == "streaming" - and config_options.executor.runtime == "rapidsmpf" - ): # pragma: no cover; Requires rapidsmpf runtime - from cudf_polars.experimental.rapidsmpf.io import lower_scan_rapidsmpf + from cudf_polars.experimental.rapidsmpf.io import lower_scan_rapidsmpf - return lower_scan_rapidsmpf(ir, rec) - - if ( - ir.typ in ("csv", "parquet", "ndjson") - and ir.n_rows == -1 - and ir.skip_rows == 0 - and ir.row_index is None - ): - plan = scan_partition_plan(ir, rec.state["stats"], config_options) - paths = list(ir.paths) - if plan.flavor == IOPartitionFlavor.SPLIT_FILES: - # Disable chunked reader when splitting files - parquet_options = dataclasses.replace( - config_options.parquet_options, - chunked=False, - ) - - slices: list[SplitScan] = [] - for path in paths: - base_scan = Scan( - ir.schema, - ir.typ, - ir.reader_options, - ir.cloud_options, - [path], - ir.with_columns, - ir.skip_rows, - ir.n_rows, - ir.row_index, - ir.include_file_paths, - ir.predicate, - parquet_options, - ) - slices.extend( - SplitScan( - ir.schema, base_scan, sindex, plan.factor, parquet_options - ) - for sindex in range(plan.factor) - ) - new_node = Union(ir.schema, None, *slices) - partition_info = {slice: PartitionInfo(count=1) for slice in slices} | { - new_node: PartitionInfo(count=len(slices)) - } - else: - groups: list[Scan] = [ - Scan( - ir.schema, - ir.typ, - ir.reader_options, - ir.cloud_options, - paths[i : i + plan.factor], - ir.with_columns, - ir.skip_rows, - ir.n_rows, - ir.row_index, - ir.include_file_paths, - ir.predicate, - config_options.parquet_options, - ) - for i 
in range(0, len(paths), plan.factor) - ] - new_node = Union(ir.schema, None, *groups) - partition_info = {group: PartitionInfo(count=1) for group in groups} | { - new_node: PartitionInfo(count=len(groups)) - } - return new_node, partition_info - - return ir, {ir: PartitionInfo(count=1)} # pragma: no cover + return lower_scan_rapidsmpf(ir, rec) class StreamingSink(IR): @@ -441,22 +335,6 @@ def _prepare_sink_directory(path: str) -> None: Path(path).mkdir(parents=True, exist_ok=True) -def _sink_to_directory( - schema: Schema, - kind: str, - path: str, - parquet_options: ParquetOptions, - options: dict[str, Any], - df: DataFrame, - ready: None, - context: IRExecutionContext, -) -> DataFrame: - """Sink a partition to a new file.""" - return Sink.do_evaluate( - schema, kind, path, parquet_options, options, df, context=context - ) - - def _sink_to_parquet_file( path: str, options: dict[str, Any], @@ -545,106 +423,6 @@ def _sink_to_file( return True -def _finalize_file_sink( - kind: str, - writer_state: Any, - df: DataFrame, -) -> DataFrame: - """Finalize the file sink by closing the writer.""" - if kind == "Parquet" and writer_state is not None: - writer_state.close([]) - return df.slice((0, 0)) - - -def _file_sink_graph( - ir: StreamingSink, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - """Sink to a single file.""" - name = get_key_name(ir) - count = partition_info[ir].count - child_name = get_key_name(ir.children[0]) - sink = ir.sink - if count == 1: - return { - (name, 0): ( - partial(sink.do_evaluate, context=context), - *sink._non_child_args, - (child_name, 0), - ) - } - - sink_name = get_key_name(sink) - graph: MutableMapping[Any, Any] = { - (sink_name, i): ( - _sink_to_file, - sink.kind, - sink.path, - sink.options, - None if i == 0 else (sink_name, i - 1), # Writer state - (child_name, i), - ) - for i in range(count) - } - - # Finalize task closes the writer after all chunks are written - 
graph[(sink_name, "finalize")] = ( - _finalize_file_sink, - sink.kind, - (sink_name, count - 1), # Writer state from last task - (child_name, count - 1), # Last source df for creating empty result - ) - - # Make sure final tasks point to finalize task - graph.update({(name, i): (sink_name, "finalize") for i in range(count)}) - return graph - - -def _directory_sink_graph( - ir: StreamingSink, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - """Sink to a directory of files.""" - name = get_key_name(ir) - count = partition_info[ir].count - child_name = get_key_name(ir.children[0]) - sink = ir.sink - - setup_name = f"setup-{name}" - suffix = sink.kind.lower() - width = math.ceil(math.log10(count)) - graph: MutableMapping[Any, Any] = { - (name, i): ( - _sink_to_directory, - sink.schema, - sink.kind, - f"{sink.path}/part.{str(i).zfill(width)}.{suffix}", - sink.parquet_options, - sink.options, - (child_name, i), - setup_name, - context, - ) - for i in range(count) - } - graph[setup_name] = (_prepare_sink_directory, sink.path) - return graph - - -@generate_ir_tasks.register(StreamingSink) -def _( - ir: StreamingSink, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - if ir.sink_to_directory: - return _directory_sink_graph(ir, partition_info, context=context) - else: - return _file_sink_graph(ir, partition_info, context=context) - - class ParquetMetadata: """ Parquet metadata container. 
diff --git a/python/cudf_polars/cudf_polars/experimental/join.py b/python/cudf_polars/cudf_polars/experimental/join.py index 47d0ad90d8e..cd5c514b45a 100644 --- a/python/cudf_polars/cudf_polars/experimental/join.py +++ b/python/cudf_polars/cudf_polars/experimental/join.py @@ -5,16 +5,15 @@ from __future__ import annotations import operator -from functools import partial, reduce -from typing import TYPE_CHECKING, Any +from functools import reduce +from typing import TYPE_CHECKING from cudf_polars.dsl.ir import ConditionalJoin, Join, Slice -from cudf_polars.experimental.base import PartitionInfo, get_key_name -from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node +from cudf_polars.experimental.base import PartitionInfo +from cudf_polars.experimental.dispatch import lower_ir_node from cudf_polars.experimental.repartition import Repartition -from cudf_polars.experimental.shuffle import Shuffle, _hash_partition_dataframe +from cudf_polars.experimental.shuffle import Shuffle from cudf_polars.experimental.utils import ( - _concat, _dynamic_planning_on, _fallback_inform, _lower_ir_fallback, @@ -24,16 +23,14 @@ from collections.abc import MutableMapping from cudf_polars.dsl.expr import NamedExpr - from cudf_polars.dsl.ir import IR, IRExecutionContext + from cudf_polars.dsl.ir import IR from cudf_polars.experimental.parallel import LowerIRTransformer - from cudf_polars.utils.config import ShuffleMethod def _maybe_shuffle_frame( frame: IR, on: tuple[NamedExpr, ...], partition_info: MutableMapping[IR, PartitionInfo], - shuffle_method: ShuffleMethod, output_count: int, ) -> IR: # Shuffle `frame` if it isn't already shuffled. 
@@ -48,7 +45,6 @@ def _maybe_shuffle_frame( frame = Shuffle( frame.schema, on, - shuffle_method, frame, ) partition_info[frame] = PartitionInfo( @@ -64,21 +60,18 @@ def _make_hash_join( partition_info: MutableMapping[IR, PartitionInfo], left: IR, right: IR, - shuffle_method: ShuffleMethod, ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: # Shuffle left and right dataframes (if necessary) left = _maybe_shuffle_frame( left, ir.left_on, partition_info, - shuffle_method, output_count, ) right = _maybe_shuffle_frame( right, ir.right_on, partition_info, - shuffle_method, output_count, ) # Always reconstruct in case children contain Cache nodes @@ -146,45 +139,7 @@ def _make_bcast_join( partition_info: MutableMapping[IR, PartitionInfo], left: IR, right: IR, - shuffle_method: ShuffleMethod, - *, - streaming_runtime: str, ) -> tuple[IR, MutableMapping[IR, PartitionInfo]]: - if ir.options[0] != "Inner": - left_count = partition_info[left].count - right_count = partition_info[right].count - - # Shuffle the smaller table (if necessary) - Notes: - # - We need to shuffle the smaller table if - # (1) we are not doing an "inner" join, - # and (2) the small table contains multiple - # partitions. - # - We cannot simply join a large-table partition - # to each small-table partition, and then - # concatenate the partial-join results, because - # a non-"inner" join does NOT commute with - # concatenation. - # - In some cases, we can perform the partial joins - # sequentially. However, we are starting with a - # catch-all algorithm that works for all cases. 
- if streaming_runtime == "tasks": - if left_count >= right_count: - right = _maybe_shuffle_frame( - right, - ir.right_on, - partition_info, - shuffle_method, - right_count, - ) - else: - left = _maybe_shuffle_frame( - left, - ir.left_on, - partition_info, - shuffle_method, - left_count, - ) - new_node = ir.reconstruct([left, right]) partition_info[new_node] = PartitionInfo(count=output_count) return new_node, partition_info @@ -301,8 +256,6 @@ def _( partition_info, left, right, - config_options.executor.shuffle_method, - streaming_runtime=config_options.executor.runtime, ) else: # Create a hash join @@ -312,109 +265,4 @@ def _( partition_info, left, right, - config_options.executor.shuffle_method, ) - - -@generate_ir_tasks.register(Join) -def _( - ir: Join, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - left, right = ir.children - output_count = partition_info[ir].count - - left_partitioned = ( - partition_info[left].partitioned_on == ir.left_on - and partition_info[left].count == output_count - ) - right_partitioned = ( - partition_info[right].partitioned_on == ir.right_on - and partition_info[right].count == output_count - ) - - if output_count == 1 or (left_partitioned and right_partitioned): - # Partition-wise join - left_name = get_key_name(left) - right_name = get_key_name(right) - return { - key: ( - partial(ir.do_evaluate, context=context), - *ir._non_child_args, - (left_name, i), - (right_name, i), - ) - for i, key in enumerate(partition_info[ir].keys(ir)) - } - else: - # Broadcast join - left_parts = partition_info[left] - right_parts = partition_info[right] - if left_parts.count >= right_parts.count: - small_side = "Right" - small_name = get_key_name(right) - small_size = partition_info[right].count - large_name = get_key_name(left) - large_on = ir.left_on - else: - small_side = "Left" - small_name = get_key_name(left) - small_size = partition_info[left].count - large_name = 
get_key_name(right) - large_on = ir.right_on - - graph: MutableMapping[Any, Any] = {} - - out_name = get_key_name(ir) - out_size = partition_info[ir].count - split_name = f"split-{out_name}" - getit_name = f"getit-{out_name}" - inter_name = f"inter-{out_name}" - - # Split each large partition if we have - # multiple small partitions (unless this - # is an inner join) - split_large = ir.options[0] != "Inner" and small_size > 1 - - for part_out in range(out_size): - if split_large: - graph[(split_name, part_out)] = ( - _hash_partition_dataframe, - (large_name, part_out), - part_out, - small_size, - None, - large_on, - ) - - _concat_list = [] - for j in range(small_size): - left_key: tuple[str, int] | tuple[str, int, int] - if split_large: - left_key = (getit_name, part_out, j) - graph[left_key] = (operator.getitem, (split_name, part_out), j) - else: - left_key = (large_name, part_out) - join_children = [left_key, (small_name, j)] - if small_side == "Left": - join_children.reverse() - - inter_key = (inter_name, part_out, j) - graph[(inter_name, part_out, j)] = ( - partial(ir.do_evaluate, context=context), - ir.left_on, - ir.right_on, - ir.options, - *join_children, - ) - _concat_list.append(inter_key) - if len(_concat_list) == 1: - graph[(out_name, part_out)] = graph.pop(_concat_list[0]) - else: - graph[(out_name, part_out)] = ( - partial(_concat, context=context), - *_concat_list, - ) - - return graph diff --git a/python/cudf_polars/cudf_polars/experimental/parallel.py b/python/cudf_polars/cudf_polars/experimental/parallel.py index f77e923bce0..ab5d3b5bd90 100644 --- a/python/cudf_polars/cudf_polars/experimental/parallel.py +++ b/python/cudf_polars/cudf_polars/experimental/parallel.py @@ -4,10 +4,9 @@ from __future__ import annotations -import itertools import operator from functools import partial, reduce -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING import polars as pl @@ -26,7 +25,6 @@ Filter, HConcat, HStack, - IRExecutionContext, 
MapFunction, Projection, Select, @@ -35,16 +33,11 @@ ) from cudf_polars.dsl.traversal import CachingVisitor, traversal from cudf_polars.dsl.utils.naming import unique_names -from cudf_polars.experimental.base import PartitionInfo, get_key_name -from cudf_polars.experimental.dispatch import ( - generate_ir_tasks, - lower_ir_node, -) +from cudf_polars.experimental.base import PartitionInfo +from cudf_polars.experimental.dispatch import lower_ir_node from cudf_polars.experimental.io import _clear_source_info_cache from cudf_polars.experimental.repartition import Repartition -from cudf_polars.experimental.statistics import collect_statistics from cudf_polars.experimental.utils import ( - _concat, _contains_over, _dynamic_planning_on, _lower_ir_fallback, @@ -52,7 +45,6 @@ if TYPE_CHECKING: from collections.abc import MutableMapping - from typing import Any from cudf_polars.experimental.base import StatsCollector from cudf_polars.experimental.dispatch import LowerIRTransformer, State @@ -109,63 +101,6 @@ def lower_ir_graph( return mapper(ir) -def task_graph( - ir: IR, - partition_info: MutableMapping[IR, PartitionInfo], -) -> tuple[MutableMapping[Any, Any], str | tuple[str, int]]: - """ - Construct a task graph for evaluation of an IR graph. - - Parameters - ---------- - ir - Root of the graph to rewrite. - partition_info - A mapping from all unique IR nodes to the - associated partitioning information. - - Returns - ------- - graph - A task graph for the entire IR graph with root `ir`, - in dict-of-tuples form consumed by - :func:`~cudf_polars.experimental.scheduler.synchronous_scheduler`. - - Notes - ----- - This function traverses the unique nodes of the - graph with root `ir`, and extracts the tasks for - each node with :func:`generate_ir_tasks`. 
- - See Also - -------- - generate_ir_tasks - """ - context = IRExecutionContext() - graph = reduce( - operator.or_, - ( - generate_ir_tasks(node, partition_info, context=context) - for node in traversal([ir]) - ), - ) - - key_name = get_key_name(ir) - partition_count = partition_info[ir].count - - key: str | tuple[str, int] - if partition_count > 1: - graph[key_name] = ( - partial(_concat, context=context), - *partition_info[ir].keys(ir), - ) - key = key_name - else: - key = (key_name, 0) - - return graph, key - - def evaluate_rapidsmpf( ir: IR, config_options: ConfigOptions[StreamingExecutor], @@ -211,44 +146,7 @@ def evaluate_streaming( # Clear source info cache in case data was overwritten _clear_source_info_cache() - if ( - config_options.executor.runtime == "rapidsmpf" - ): # pragma: no cover; rapidsmpf runtime not tested in CI yet - # Using the RapidsMPF streaming runtime. - return evaluate_rapidsmpf(ir, config_options) - else: - # Using the default task engine. - from cudf_polars.experimental.scheduler import synchronous_scheduler - - stats = collect_statistics(ir, config_options) - ir, partition_info = lower_ir_graph(ir, config_options, stats) - - graph, key = task_graph(ir, partition_info) - - return synchronous_scheduler(graph, key).to_polars() - - -@generate_ir_tasks.register(IR) -def _( - ir: IR, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - # Generate pointwise (embarrassingly-parallel) tasks by default - child_names = [get_key_name(c) for c in ir.children] - bcast_child = [partition_info[c].count == 1 for c in ir.children] - - return { - key: ( - partial(ir.do_evaluate, context=context), - *ir._non_child_args, - *[ - (child_name, 0 if bcast_child[j] else i) - for j, child_name in enumerate(child_names) - ], - ) - for i, key in enumerate(partition_info[ir].keys(ir)) - } + return evaluate_rapidsmpf(ir, config_options) @lower_ir_node.register(Union) @@ -278,21 +176,6 @@ def _( 
return new_node, partition_info -@generate_ir_tasks.register(Union) -def _( - ir: Union, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - key_name = get_key_name(ir) - partition = itertools.count() - return { - (key_name, next(partition)): child_key - for child in ir.children - for child_key in partition_info[child].keys(child) - } - - @lower_ir_node.register(MapFunction) def _( ir: MapFunction, rec: LowerIRTransformer diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py index 478c0a33beb..97168f0b02d 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/core.py @@ -99,8 +99,6 @@ def evaluate_logical_plan( ------- The output DataFrame and metadata collector. """ - assert config_options.executor.runtime == "rapidsmpf", "Runtime must be rapidsmpf" - query_id = uuid.uuid4() with cudf_polars.dsl.tracing.bound_contextvars( @@ -202,8 +200,6 @@ def evaluate_pipeline( ------- The output DataFrame and metadata collector. 
""" - assert config_options.executor.runtime == "rapidsmpf", "Runtime must be rapidsmpf" - _original_mr: Any = None use_stream_pool = False if rmpf_context is not None: diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py index 7bc8dabddec..26ad95198f6 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/core.py @@ -436,7 +436,7 @@ def execute_ir_on_rank( _RESERVED_EXECUTOR_KEYS: frozenset[str] = frozenset( - {"runtime", "cluster", "spmd_context", "ray_context", "dask_context"} + {"cluster", "spmd_context", "ray_context", "dask_context"} ) _RESERVED_ENGINE_KEYS: frozenset[str] = frozenset({"memory_resource", "executor"}) diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py index 49810e998fd..b4300346132 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/dask.py @@ -688,7 +688,6 @@ def __init__( nranks=nranks, executor_options={ **executor_options, - "runtime": "rapidsmpf", "cluster": "dask", "dask_context": dask_ctx, }, @@ -736,7 +735,6 @@ def _reset( nranks=self._nranks, executor_options={ **executor_options, - "runtime": "rapidsmpf", "cluster": "dask", "dask_context": ctx, }, diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/options.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/options.py index d8464aa7426..c7650bff513 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/options.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/options.py @@ -234,11 +234,6 @@ class StreamingOptions: Env: ``CUDF_POLARS__EXECUTOR__DYNAMIC_PLANNING``. Default: enabled. Category: executor. 
- unique_fraction - Per-column uniqueness estimate (0-1). Defaults to ``1.0``. - Env: ``CUDF_POLARS__EXECUTOR__UNIQUE_FRACTION``. - Default: ``{}``. - Category: executor. sink_to_directory Whether multi-partition sink operations should write to a directory rather than a single file. The ``spmd``/``ray``/``dask`` engines @@ -332,9 +327,6 @@ class StreamingOptions: dynamic_planning: dict[str, Any] | DynamicPlanningOptions | None | Unspecified = ( _opt("executor") ) - unique_fraction: dict[str, float] | Unspecified = _opt( - "executor", "CUDF_POLARS__EXECUTOR__UNIQUE_FRACTION", json.loads - ) sink_to_directory: bool | Unspecified = _opt( "executor", "CUDF_POLARS__EXECUTOR__SINK_TO_DIRECTORY", parse_boolean ) @@ -515,7 +507,6 @@ def _get(attr: str) -> Any: broadcast_join_limit=_get("broadcast_join_limit"), target_partition_size=target_partition_size, dynamic_planning=dynamic_planning, - unique_fraction=_get("unique_fraction"), raise_on_fail=_get("raise_on_fail"), parquet_options=_get("parquet_options"), memory_resource_config=_get("memory_resource_config"), @@ -711,15 +702,6 @@ def _add_cli_args(parser: argparse.ArgumentParser) -> None: Enable dynamic planning. Use --no-dynamic-planning to disable. Env: CUDF_POLARS__EXECUTOR__DYNAMIC_PLANNING. Built-in default: enabled."""), ) - g.add_argument( - "--unique-fraction", - dest="unique_fraction", - default=None, - type=json.loads, - help=textwrap.dedent("""\ - Per-column uniqueness estimate as a JSON object (e.g. '{"col": 0.5}'). - Env: CUDF_POLARS__EXECUTOR__UNIQUE_FRACTION. 
Built-in default: {}."""), - ) g.add_argument( "--stream-policy", dest="stream_policy", diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py index 1ba92de3e49..efbb1db9ad4 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/ray.py @@ -90,13 +90,9 @@ def evaluate_pipeline_ray_mode( Raises ------ - RuntimeError - If the configured executor runtime is not ``"rapidsmpf"``. RuntimeError If ``config_options.executor.ray_context`` is not set. """ - if config_options.executor.runtime != "rapidsmpf": - raise RuntimeError("Runtime must be rapidsmpf") if config_options.executor.ray_context is None: raise RuntimeError("ray_context must be set when cluster='ray'") rank_actors = config_options.executor.ray_context.rank_actors @@ -586,7 +582,6 @@ def __init__( nranks=nranks, executor_options={ **executor_options, - "runtime": "rapidsmpf", "cluster": "ray", "ray_context": RayContext(rank_actors), }, @@ -641,7 +636,6 @@ def _reset( nranks=len(self._rank_actors), executor_options={ **executor_options, - "runtime": "rapidsmpf", "cluster": "ray", "ray_context": RayContext(self._rank_actors), }, diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py index 65e3eb8b1e7..7e1bde808cd 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/frontend/spmd.py @@ -23,12 +23,11 @@ from rapidsmpf.statistics import Statistics from rapidsmpf.streaming.core.context import Context -import polars as pl - import pylibcudf as plc import rmm.mr from pylibcudf.contiguous_split import pack +from cudf_polars.containers import DataFrame, DataType from cudf_polars.experimental.rapidsmpf.collectives.common import 
reserve_op_id from cudf_polars.experimental.rapidsmpf.frontend.core import ( ClusterInfo, @@ -53,6 +52,8 @@ from rapidsmpf.config import Options from rapidsmpf.streaming.cudf.channel_metadata import ChannelMetadata + import polars as pl + from cudf_polars.dsl.ir import IR from cudf_polars.experimental.parallel import ConfigOptions from cudf_polars.experimental.rapidsmpf.frontend.core import T @@ -98,8 +99,6 @@ def evaluate_pipeline_spmd_mode( The concatenated output DataFrame and, if ``collect_metadata`` is True, the list of channel metadata objects; otherwise ``None``. """ - if config_options.executor.runtime != "rapidsmpf": - raise RuntimeError("Runtime must be rapidsmpf") if config_options.executor.spmd_context is None: raise RuntimeError("spmd_context must be set for SPMD mode") comm = config_options.executor.spmd_context.comm @@ -155,8 +154,9 @@ def allgather_polars_dataframe( ctx = engine.context stream = ctx.get_stream_from_pool() col_names = local_df.columns + dtypes = [DataType(dtype) for dtype in local_df.dtypes] - plc_table = plc.Table.from_arrow(local_df.to_arrow()) + plc_table = plc.Table.from_arrow(local_df, stream=stream) packed_data = PackedData.from_cudf_packed_columns( pack(plc_table, stream), @@ -176,9 +176,12 @@ def allgather_polars_dataframe( plc_result = unpack_and_concat(results, stream, ctx.br()) # pylibcudf Table -> pl.DataFrame (restore column names) - ret = pl.from_arrow(plc_result.to_arrow(col_names)) - assert isinstance(ret, pl.DataFrame) - return ret + return DataFrame.from_table( + plc_result, + col_names, + dtypes, + stream, + ).to_polars() class SPMDEngine(StreamingEngine): @@ -389,7 +392,6 @@ def __init__( nranks=comm.nranks, executor_options={ **executor_options, - "runtime": "rapidsmpf", "cluster": "spmd", "spmd_context": SPMDContext( comm=comm, context=ctx, py_executor=self._py_executor @@ -494,7 +496,6 @@ def _reset( nranks=self._comm.nranks, executor_options={ **executor_options, - "runtime": "rapidsmpf", "cluster": "spmd", 
"spmd_context": SPMDContext( comm=self._comm, diff --git a/python/cudf_polars/cudf_polars/experimental/repartition.py b/python/cudf_polars/cudf_polars/experimental/repartition.py index 92d89a5f44c..84c39d930ca 100644 --- a/python/cudf_polars/cudf_polars/experimental/repartition.py +++ b/python/cudf_polars/cudf_polars/experimental/repartition.py @@ -4,20 +4,11 @@ from __future__ import annotations -import itertools -from functools import partial -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING from cudf_polars.dsl.ir import IR -from cudf_polars.experimental.base import get_key_name -from cudf_polars.experimental.dispatch import generate_ir_tasks -from cudf_polars.experimental.utils import _concat if TYPE_CHECKING: - from collections.abc import MutableMapping - - from cudf_polars.dsl.ir import IRExecutionContext - from cudf_polars.experimental.parallel import PartitionInfo from cudf_polars.typing import Schema @@ -43,35 +34,3 @@ def __init__(self, schema: Schema, df: IR): self.schema = schema self._non_child_args = () self.children = (df,) - - -@generate_ir_tasks.register(Repartition) -def _( - ir: Repartition, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - # Repartition an IR node. - # Only supports rapartitioning to fewer (for now). - - (child,) = ir.children - count_in = partition_info[child].count - count_out = partition_info[ir].count - - if count_out > count_in: # pragma: no cover - raise NotImplementedError( - f"Repartition {count_in} -> {count_out} not supported." - ) - - key_name = get_key_name(ir) - n, remainder = divmod(count_in, count_out) - # Spread remainder evenly over the partitions. 
- offsets = [0, *itertools.accumulate(n + (i < remainder) for i in range(count_out))] - child_keys = tuple(partition_info[child].keys(child)) - return { - (key_name, i): ( - partial(_concat, context=context), - *child_keys[offsets[i] : offsets[i + 1]], - ) - for i in range(count_out) - } diff --git a/python/cudf_polars/cudf_polars/experimental/scheduler.py b/python/cudf_polars/cudf_polars/experimental/scheduler.py deleted file mode 100644 index 97eae6ab378..00000000000 --- a/python/cudf_polars/cudf_polars/experimental/scheduler.py +++ /dev/null @@ -1,153 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. -# SPDX-License-Identifier: Apache-2.0 -"""Synchronous task scheduler.""" - -from __future__ import annotations - -from collections import Counter -from collections.abc import MutableMapping -from itertools import chain -from typing import TYPE_CHECKING, Any, TypeVar, Unpack - -if TYPE_CHECKING: - from collections.abc import Mapping - from typing import TypeAlias - - -Key: TypeAlias = str | tuple[str, Unpack[tuple[int, ...]]] -Graph: TypeAlias = MutableMapping[Key, Any] -T_ = TypeVar("T_") - - -# NOTE: This is a slimmed-down version of the single-threaded -# (synchronous) scheduler in `dask.core`. -# -# Key Differences: -# * We do not allow a task to contain a list of key names. -# Keys must be distinct elements of the task. -# * We do not support nested tasks. 
- - -def istask(x: Any) -> bool: - """Check if x is a callable task.""" - return isinstance(x, tuple) and bool(x) and callable(x[0]) - - -def is_hashable(x: Any) -> bool: - """Check if x is hashable.""" - try: - hash(x) - except BaseException: - return False - else: - return True - - -def _execute_task(arg: Any, cache: Mapping) -> Any: - """Execute a compute task.""" - if istask(arg): - return arg[0](*(_execute_task(a, cache) for a in arg[1:])) - elif is_hashable(arg): - return cache.get(arg, arg) - else: - return arg - - -def required_keys(key: Key, graph: Graph) -> list[Key]: - """ - Return the dependencies to extract a key from the graph. - - Parameters - ---------- - key - Root key we want to extract. - graph - The full task graph. - - Returns - ------- - List of other keys needed to extract ``key``. - """ - maybe_task = graph[key] - return [ - k - for k in ( - maybe_task[1:] - if istask(maybe_task) - else [maybe_task] # maybe_task might be a key - ) - if is_hashable(k) and k in graph - ] - - -def toposort(graph: Graph, dependencies: Mapping[Key, list[Key]]) -> list[Key]: - """Return a list of task keys sorted in topological order.""" - # Stack-based depth-first search traversal. 
This is based on Tarjan's - # algorithm for strongly-connected components - # (https://en.wikipedia.org/wiki/Tarjan%27s_strongly_connected_components_algorithm) - ordered: list[Key] = [] - completed: set[Key] = set() - - for key in graph: - if key in completed: - continue - nodes = [key] - while nodes: - # Keep current node on the stack until all descendants are visited - current = nodes[-1] - if current in completed: # pragma: no cover - # Already fully traversed descendants of current - nodes.pop() - continue - - # Add direct descendants of current to nodes stack - next_nodes = set(dependencies[current]) - completed - if next_nodes: - nodes.extend(next_nodes) - else: - # Current has no more descendants to explore - ordered.append(current) - completed.add(current) - nodes.pop() - - return ordered - - -def synchronous_scheduler( - graph: Graph, - key: Key, - *, - cache: MutableMapping | None = None, -) -> Any: - """ - Execute the task graph for a given key. - - Parameters - ---------- - graph - The task graph to execute. - key - The final output key to extract from the graph. - cache - Intermediate-data cache. - - Returns - ------- - Executed task-graph result for ``key``. 
- """ - if key not in graph: # pragma: no cover - raise KeyError(f"{key} is not a key in the graph") - if cache is None: - cache = {} - - dependencies = {k: required_keys(k, graph) for k in graph} - refcount = Counter(chain.from_iterable(dependencies.values())) - - for k in toposort(graph, dependencies): - cache[k] = _execute_task(graph[k], cache) - for dep in dependencies[k]: - refcount[dep] -= 1 - if refcount[dep] == 0 and dep != key: - del cache[dep] - - return cache[key] diff --git a/python/cudf_polars/cudf_polars/experimental/shuffle.py b/python/cudf_polars/cudf_polars/experimental/shuffle.py index 8e24dd83fe6..9381126775f 100644 --- a/python/cudf_polars/cudf_polars/experimental/shuffle.py +++ b/python/cudf_polars/cudf_polars/experimental/shuffle.py @@ -4,112 +4,22 @@ from __future__ import annotations -import operator -from functools import partial -from typing import TYPE_CHECKING, Any, Concatenate, TypeVar, TypedDict +from typing import TYPE_CHECKING -import pylibcudf as plc -from rmm.pylibrmm.stream import DEFAULT_STREAM - -from cudf_polars.containers import DataFrame -from cudf_polars.dsl.expr import Col from cudf_polars.dsl.ir import IR -from cudf_polars.dsl.tracing import log_do_evaluate, nvtx_annotate_cudf_polars -from cudf_polars.experimental.base import get_key_name -from cudf_polars.experimental.dispatch import generate_ir_tasks, lower_ir_node -from cudf_polars.experimental.utils import _concat, _dynamic_planning_on -from cudf_polars.utils.cuda_stream import get_dask_cuda_stream +from cudf_polars.dsl.tracing import log_do_evaluate +from cudf_polars.experimental.dispatch import lower_ir_node +from cudf_polars.experimental.utils import _dynamic_planning_on if TYPE_CHECKING: - from collections.abc import Callable, MutableMapping, Sequence + from collections.abc import MutableMapping - from cudf_polars.containers import DataType + from cudf_polars.containers import DataFrame from cudf_polars.dsl.expr import NamedExpr from cudf_polars.dsl.ir import 
IRExecutionContext from cudf_polars.experimental.dispatch import LowerIRTransformer from cudf_polars.experimental.parallel import PartitionInfo from cudf_polars.typing import Schema - from cudf_polars.utils.config import ShuffleMethod - - -# Supported shuffle methods -_SHUFFLE_METHODS = ("rapidsmpf", "tasks") - - -class ShuffleOptions(TypedDict): - """RapidsMPF shuffling options.""" - - on: Sequence[str] - column_names: Sequence[str] - dtypes: Sequence[DataType] - - -# Experimental rapidsmpf shuffler integration -class RMPFIntegration: # pragma: no cover - """cuDF-Polars protocol for rapidsmpf shuffler.""" - - @staticmethod - @nvtx_annotate_cudf_polars(message="RMPFIntegration.insert_partition") - def insert_partition( - df: DataFrame, - partition_id: int, # Not currently used - partition_count: int, - shuffler: Any, - options: ShuffleOptions, - *other: Any, - ) -> None: - """Add cudf-polars DataFrame chunks to an RMP shuffler.""" - from rapidsmpf.integrations.cudf.partition import partition_and_pack - from rapidsmpf.integrations.single import get_worker_context - - context = get_worker_context() - - on = options["on"] - assert not other, f"Unexpected arguments: {other}" - columns_to_hash = tuple(df.column_names.index(val) for val in on) - packed_inputs = partition_and_pack( - df.table, - columns_to_hash=columns_to_hash, - num_partitions=partition_count, - br=context.br, - stream=DEFAULT_STREAM, - ) - - shuffler.insert_chunks(packed_inputs) - - @staticmethod - @nvtx_annotate_cudf_polars(message="RMPFIntegration.extract_partition") - def extract_partition( - partition_id: int, - shuffler: Any, - options: ShuffleOptions, - ) -> DataFrame: - """Extract a finished partition from the RMP shuffler.""" - from rapidsmpf.integrations.cudf.partition import ( - unpack_and_concat, - unspill_partitions, - ) - from rapidsmpf.integrations.single import get_worker_context - - context = get_worker_context() - - shuffler.wait() - column_names = options["column_names"] - dtypes = 
options["dtypes"] - return DataFrame.from_table( - unpack_and_concat( - unspill_partitions( - shuffler.extract(partition_id), - br=context.br, - allow_overbooking=True, - ), - br=context.br, - stream=DEFAULT_STREAM, - ), - column_names, - dtypes, - get_dask_cuda_stream(), - ) class Shuffle(IR): @@ -118,29 +28,27 @@ class Shuffle(IR): Notes ----- - Only hash-based partitioning is supported (for now). See - `ShuffleSorted` for sorting-based shuffling. + Only hash-based partitioning is supported (for now). """ - __slots__ = ("keys", "shuffle_method") - _non_child = ("schema", "keys", "shuffle_method") - _n_non_child_args = 3 + __slots__ = ("keys",) + _non_child = ( + "schema", + "keys", + ) + _n_non_child_args = 2 keys: tuple[NamedExpr, ...] """Keys to shuffle on.""" - shuffle_method: ShuffleMethod - """Shuffle method to use.""" def __init__( self, schema: Schema, keys: tuple[NamedExpr, ...], - shuffle_method: ShuffleMethod, df: IR, ): self.schema = schema self.keys = keys - self.shuffle_method = shuffle_method - self._non_child_args = (schema, keys, shuffle_method) + self._non_child_args = (schema, keys) self.children = (df,) # the type-ignore is for @@ -153,7 +61,6 @@ def do_evaluate( cls, schema: Schema, keys: tuple[NamedExpr, ...], - shuffle_method: ShuffleMethod, df: DataFrame, *, context: IRExecutionContext, @@ -163,120 +70,6 @@ def do_evaluate( return df -@nvtx_annotate_cudf_polars(message="Shuffle") -def _hash_partition_dataframe( - df: DataFrame, - partition_id: int, # Used only by sorted shuffling - partition_count: int, - options: MutableMapping[str, Any] | None, # No options required - on: tuple[NamedExpr, ...], -) -> dict[int, DataFrame]: - """ - Partition an input DataFrame for hash-based shuffling. - - Parameters - ---------- - df - DataFrame to partition. - partition_id - Partition index (unused for hash partitioning). - partition_count - Total number of output partitions. - options - Options (unused for hash partitioning). 
- on - Expressions used for the hash partitioning. - - Returns - ------- - A dictionary mapping between int partition indices and - DataFrame fragments. - """ - assert not options, f"Expected no options, got: {options}" - - if df.num_rows == 0: - # Fast path for empty DataFrame - return dict.fromkeys(range(partition_count), df) - - # Hash the specified keys to calculate the output - # partition for each row - partition_map = plc.binaryop.binary_operation( - plc.hashing.murmurhash3_x86_32( - DataFrame([expr.evaluate(df) for expr in on], stream=df.stream).table, - stream=df.stream, - ), - plc.Scalar.from_py( - partition_count, plc.DataType(plc.TypeId.UINT32), stream=df.stream - ), - plc.binaryop.BinaryOperator.PYMOD, - plc.types.DataType(plc.types.TypeId.UINT32), - stream=df.stream, - ) - - # Apply partitioning - t, offsets = plc.partitioning.partition( - df.table, - partition_map, - partition_count, - stream=df.stream, - ) - splits = offsets[1:-1] - - # Split and return the partitioned result - return { - i: DataFrame.from_table( - split, - df.column_names, - df.dtypes, - df.stream, - ) - for i, split in enumerate(plc.copying.split(t, splits, stream=df.stream)) - } - - -# When dropping Python 3.10, can use _simple_shuffle_graph[OPT_T](...) 
-OPT_T = TypeVar("OPT_T") - - -def _simple_shuffle_graph( - name_in: str, - name_out: str, - count_in: int, - count_out: int, - _partition_dataframe_func: Callable[ - Concatenate[DataFrame, int, int, OPT_T, ...], - MutableMapping[int, DataFrame], - ], - options: OPT_T, - *other: Any, - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - """Make a simple all-to-all shuffle graph.""" - split_name = f"split-{name_out}" - inter_name = f"inter-{name_out}" - - graph: MutableMapping[Any, Any] = {} - for part_out in range(count_out): - _concat_list = [] - for part_in in range(count_in): - graph[(split_name, part_in)] = ( - _partition_dataframe_func, - (name_in, part_in), - part_in, - count_out, - options, - *other, - ) - _concat_list.append((inter_name, part_out, part_in)) - graph[_concat_list[-1]] = ( - operator.getitem, - (split_name, part_in), - part_out, - ) - graph[(name_out, part_out)] = (partial(_concat, context=context), *_concat_list) - return graph - - @lower_ir_node.register(Shuffle) def _( ir: Shuffle, rec: LowerIRTransformer @@ -306,47 +99,3 @@ def _( partitioned_on=ir.keys, ) return new_node, pi - - -@generate_ir_tasks.register(Shuffle) -def _( - ir: Shuffle, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - # Extract "shuffle_method" configuration - shuffle_method = ir.shuffle_method - - # Try using rapidsmpf shuffler if we have "simple" shuffle - # keys, and the "shuffle_method" config is set to "rapidsmpf-single". 
- _keys: list[Col] - if shuffle_method == "rapidsmpf-single" and len( - _keys := [ne.value for ne in ir.keys if isinstance(ne.value, Col)] - ) == len(ir.keys): # pragma: no cover - from rapidsmpf.integrations.single import rapidsmpf_shuffle_graph - - shuffle_on = [k.name for k in _keys] - - return rapidsmpf_shuffle_graph( - get_key_name(ir.children[0]), - get_key_name(ir), - partition_info[ir.children[0]].count, - partition_info[ir].count, - RMPFIntegration, - { - "on": shuffle_on, - "column_names": list(ir.schema.keys()), - "dtypes": list(ir.schema.values()), - }, - ) - - # Simple task-based fall-back - return partial(_simple_shuffle_graph, context=context)( - get_key_name(ir.children[0]), - get_key_name(ir), - partition_info[ir.children[0]].count, - partition_info[ir].count, - _hash_partition_dataframe, - None, - ir.keys, - ) diff --git a/python/cudf_polars/cudf_polars/experimental/sort.py b/python/cudf_polars/cudf_polars/experimental/sort.py index 6800fb4ab74..fa610324c2d 100644 --- a/python/cudf_polars/cudf_polars/experimental/sort.py +++ b/python/cudf_polars/cudf_polars/experimental/sort.py @@ -4,47 +4,30 @@ from __future__ import annotations -from functools import partial -from typing import TYPE_CHECKING, Any, TypedDict +from typing import TYPE_CHECKING import polars as pl import pylibcudf as plc -from rmm.pylibrmm.stream import DEFAULT_STREAM from cudf_polars.containers import Column, DataFrame, DataType from cudf_polars.dsl.expr import Col -from cudf_polars.dsl.ir import IR, Slice, Sort +from cudf_polars.dsl.ir import Slice, Sort from cudf_polars.dsl.traversal import traversal from cudf_polars.dsl.utils.naming import unique_names -from cudf_polars.experimental.base import PartitionInfo, get_key_name -from cudf_polars.experimental.dispatch import ( - generate_ir_tasks, - lower_ir_node, -) -from cudf_polars.experimental.repartition import Repartition -from cudf_polars.experimental.shuffle import _simple_shuffle_graph +from cudf_polars.experimental.dispatch 
import lower_ir_node from cudf_polars.experimental.utils import ( - _concat, - _fallback_inform, _lower_ir_fallback, ) -from cudf_polars.utils.config import ShuffleMethod -from cudf_polars.utils.cuda_stream import ( - get_dask_cuda_stream, - get_joined_cuda_stream, - join_cuda_streams, -) if TYPE_CHECKING: from collections.abc import MutableMapping, Sequence from rmm.pylibrmm.stream import Stream - from cudf_polars.dsl.expr import NamedExpr - from cudf_polars.dsl.ir import IRExecutionContext + from cudf_polars.dsl.ir import IR + from cudf_polars.experimental.base import PartitionInfo from cudf_polars.experimental.dispatch import LowerIRTransformer - from cudf_polars.typing import Schema def find_sort_splits( @@ -251,248 +234,6 @@ def _get_final_sort_boundaries( ) -def _sort_boundaries_graph( - name_in: str, - by: Sequence[str], - column_order: Sequence[plc.types.Order], - null_order: Sequence[plc.types.NullOrder], - count: int, - context: IRExecutionContext, -) -> tuple[str, MutableMapping[Any, Any]]: - """Graph to get the boundaries from all partitions.""" - local_boundaries_name = f"sort-boundaries_local-{name_in}" - concat_boundaries_name = f"sort-boundaries-concat-{name_in}" - global_boundaries_name = f"sort-boundaries-{name_in}" - graph: MutableMapping[Any, Any] = {} - - _concat_list = [] - for part_id in range(count): - graph[(local_boundaries_name, part_id)] = ( - _select_local_split_candidates, - (name_in, part_id), - by, - count, - part_id, - ) - _concat_list.append((local_boundaries_name, part_id)) - - graph[concat_boundaries_name] = (partial(_concat, context=context), *_concat_list) - graph[global_boundaries_name] = ( - _get_final_sort_boundaries, - concat_boundaries_name, - column_order, - null_order, - count, - ) - return global_boundaries_name, graph - - -class SortedShuffleOptions(TypedDict): - """RapidsMPF shuffling options.""" - - by: Sequence[str] - order: Sequence[plc.types.Order] - null_order: Sequence[plc.types.NullOrder] - column_names: 
Sequence[str] - column_dtypes: Sequence[DataType] - - -# Experimental rapidsmpf shuffler integration -class RMPFIntegrationSortedShuffle: # pragma: no cover - """cuDF-Polars protocol for rapidsmpf shuffler.""" - - @staticmethod - def insert_partition( - df: DataFrame, - partition_id: int, - partition_count: int, - shuffler: Any, - options: SortedShuffleOptions, - sort_boundaries: DataFrame, - ) -> None: - """Add cudf-polars DataFrame chunks to an RMP shuffler.""" - from rapidsmpf.integrations.cudf.partition import split_and_pack - from rapidsmpf.integrations.single import get_worker_context - - context = get_worker_context() - - by = options["by"] - data_streams = [ - df.stream, - sort_boundaries.stream, - ] - stream = get_joined_cuda_stream(get_dask_cuda_stream, upstreams=data_streams) - - splits = find_sort_splits( - df.select(by).table, - sort_boundaries.table, - partition_id, - options["order"], - options["null_order"], - stream=stream, - ) - packed_inputs = split_and_pack( - df.table, - splits=splits, - br=context.br, - stream=stream, - ) - # TODO: figure out handoff with rapidsmpf - # https://github.com/rapidsai/cudf/issues/20337 - shuffler.insert_chunks(packed_inputs) - - join_cuda_streams(downstreams=data_streams, upstreams=[stream]) - - @staticmethod - def extract_partition( - partition_id: int, - shuffler: Any, - options: SortedShuffleOptions, - ) -> DataFrame: - """Extract a finished partition from the RMP shuffler.""" - from rapidsmpf.integrations.cudf.partition import ( - unpack_and_concat, - unspill_partitions, - ) - from rapidsmpf.integrations.single import get_worker_context - - context = get_worker_context() - - shuffler.wait() - column_names = options["column_names"] - column_dtypes = options["column_dtypes"] - - stream = DEFAULT_STREAM - - # TODO: When sorting, this step should finalize with a merge (unless we - # require stability, as cudf merge is not stable). 
- # TODO: figure out handoff with rapidsmpf - # https://github.com/rapidsai/cudf/issues/20337 - return DataFrame.from_table( - unpack_and_concat( - unspill_partitions( - shuffler.extract(partition_id), - br=context.br, - allow_overbooking=True, - ), - br=context.br, - stream=stream, - ), - column_names, - column_dtypes, - stream=stream, - ) - - -def _sort_partition_dataframe( - df: DataFrame, - partition_id: int, # Not currently used - partition_count: int, - options: MutableMapping[str, Any], - sort_boundaries: DataFrame, -) -> MutableMapping[int, DataFrame]: - """ - Partition a sorted DataFrame for shuffling. - - Parameters - ---------- - df - The DataFrame to partition. - partition_id - The partition id of the current partition. - partition_count - The total number of partitions. - options - The sort options ``(by, order, null_order)``. - sort_boundaries - The global sort boundary candidates used to decide where to split. - """ - if df.num_rows == 0: # pragma: no cover - # Fast path for empty DataFrame - return dict.fromkeys(range(partition_count), df) - - stream = get_joined_cuda_stream( - get_dask_cuda_stream, upstreams=(df.stream, sort_boundaries.stream) - ) - - splits = find_sort_splits( - df.select(options["by"]).table, - sort_boundaries.table, - partition_id, - options["order"], - options["null_order"], - stream=stream, - ) - - # Split and return the partitioned result - return { - i: DataFrame.from_table( - split, - df.column_names, - df.dtypes, - stream=df.stream, - ) - for i, split in enumerate(plc.copying.split(df.table, splits, stream=stream)) - } - - -class ShuffleSorted(IR): - """ - Shuffle already locally sorted multi-partition data. - - Shuffling is performed by extracting sort boundary candidates from all partitions, - sharing them all-to-all and then exchanging data accordingly. 
- The sorting information is required to be passed in identically to the already - performed local sort and as of now the final result needs to be sorted again to - merge the partitions. - """ - - __slots__ = ("by", "null_order", "order", "shuffle_method") - _non_child = ("schema", "by", "order", "null_order", "shuffle_method") - _n_non_child_args = 5 - by: tuple[NamedExpr, ...] - """Keys by which the data was sorted.""" - order: tuple[plc.types.Order, ...] - """Sort order if sorted.""" - null_order: tuple[plc.types.NullOrder, ...] - """Null precedence if sorted.""" - shuffle_method: ShuffleMethod - """Shuffle method to use.""" - - def __init__( - self, - schema: Schema, - by: tuple[NamedExpr, ...], - order: tuple[plc.types.Order, ...], - null_order: tuple[plc.types.NullOrder, ...], - shuffle_method: ShuffleMethod, - df: IR, - ): - self.schema = schema - self.by = by - self.order = order - self.null_order = null_order - self.shuffle_method = shuffle_method - self._non_child_args = (schema, by, order, null_order, shuffle_method) - self.children = (df,) - - @classmethod - def do_evaluate( - cls, - schema: Schema, - by: tuple[NamedExpr, ...], - order: tuple[plc.types.Order, ...], - null_order: tuple[plc.types.NullOrder, ...], - shuffle_method: ShuffleMethod, - df: DataFrame, - *, - context: IRExecutionContext, - ) -> DataFrame: # pragma: no cover - """Evaluate and return a dataframe.""" - # Single-partition ShuffleSorted evaluation is a no-op - return df - - def _has_simple_zlice(zlice: tuple[int, int | None] | None) -> bool: """Check if a zlice is a simple top-k/bottom-k operation.""" if zlice is None: @@ -517,26 +258,7 @@ def _( msg="sort currently only supports column names as `by` keys.", ) - config_options = rec.state["config_options"] - executor = config_options.executor - runtime = executor.runtime - - # Special handling for slicing - # (May be a top- or bottom-k operation) - simple_zlice = _has_simple_zlice(ir.zlice) - if simple_zlice and runtime == "tasks": - 
from cudf_polars.experimental.parallel import _lower_ir_pwise - - new_node, partition_info = _lower_ir_pwise(ir, rec) - if partition_info[new_node].count > 1: - # Collapse down to single partition - inter = Repartition(new_node.schema, new_node) - partition_info[inter] = PartitionInfo(count=1) - # Sort reduced partition - new_node = ir.reconstruct([inter]) - partition_info[new_node] = PartitionInfo(count=1) - return new_node, partition_info - elif ir.zlice is not None and not simple_zlice: + if ir.zlice is not None and not _has_simple_zlice(ir.zlice): # Pull "complex" slices out of the Sort node altogether. return rec( Slice( @@ -557,112 +279,6 @@ def _( # Extract child partitioning child, partition_info = rec(ir.children[0]) - # The "rapidsmpf" runtime uses the sort_actor to handle everything else - if runtime == "rapidsmpf": - sort_node = ir.reconstruct([child]) - partition_info[sort_node] = partition_info[child] - return sort_node, partition_info - - # TODO: Remove everything below here when "tasks" is removed. - - # Avoid rapidsmpf shuffle with maintain_order=True (for now) - shuffle_method = ( - ShuffleMethod("tasks") if ir.stable else config_options.executor.shuffle_method - ) - if ( - shuffle_method != config_options.executor.shuffle_method - ): # pragma: no cover; Requires rapidsmpf - _fallback_inform( - f"shuffle_method={config_options.executor.shuffle_method} does not support maintain_order=True. " - f"Falling back to shuffle_method={shuffle_method}.", - config_options, - ) - - if partition_info[child].count == 1: - single_part_node = ir.reconstruct([child]) - partition_info[single_part_node] = partition_info[child] - return single_part_node, partition_info - - local_sort_node = ir.reconstruct([child]) - partition_info[local_sort_node] = partition_info[child] - - shuffle = ShuffleSorted( - ir.schema, - ir.by, - ir.order, - ir.null_order, - shuffle_method, - local_sort_node, - ) - partition_info[shuffle] = partition_info[child] - - # We sort again locally. 
- assert ir.zlice is None # zlice handling would be incorrect without adjustment - final_sort_node = ir.reconstruct([shuffle]) - partition_info[final_sort_node] = partition_info[shuffle] - - return final_sort_node, partition_info - - -@generate_ir_tasks.register(ShuffleSorted) -def _( - ir: ShuffleSorted, - partition_info: MutableMapping[IR, PartitionInfo], - context: IRExecutionContext, -) -> MutableMapping[Any, Any]: - by = [ne.value.name for ne in ir.by if isinstance(ne.value, Col)] - if len(by) != len(ir.by): # pragma: no cover - # We should not reach here as this is checked in the lower_ir_node - raise NotImplementedError("Sorting columns must be column names.") - - (child,) = ir.children - - sort_boundaries_name, graph = _sort_boundaries_graph( - get_key_name(child), - by, - ir.order, - ir.null_order, - partition_info[child].count, - context, - ) - - options = { - "by": by, - "order": ir.order, - "null_order": ir.null_order, - "column_names": list(ir.schema.keys()), - "column_dtypes": list(ir.schema.values()), - } - - # Try using rapidsmpf shuffler if we have "simple" shuffle - # keys, and the "shuffle_method" config is set to "rapidsmpf-single". 
- shuffle_method = ir.shuffle_method - if shuffle_method == "rapidsmpf-single": # pragma: no cover - from rapidsmpf.integrations.single import rapidsmpf_shuffle_graph - - graph.update( - rapidsmpf_shuffle_graph( - get_key_name(child), - get_key_name(ir), - partition_info[child].count, - partition_info[ir].count, - RMPFIntegrationSortedShuffle, - options, - sort_boundaries_name, - ) - ) - return graph - - # Simple task-based fall-back - graph.update( - partial(_simple_shuffle_graph, context=context)( - get_key_name(child), - get_key_name(ir), - partition_info[child].count, - partition_info[ir].count, - _sort_partition_dataframe, - options, - sort_boundaries_name, - ) - ) - return graph + sort_node = ir.reconstruct([child]) + partition_info[sort_node] = partition_info[child] + return sort_node, partition_info diff --git a/python/cudf_polars/cudf_polars/experimental/utils.py b/python/cudf_polars/cudf_polars/experimental/utils.py index 24ce606d41b..848a4d44759 100644 --- a/python/cudf_polars/cudf_polars/experimental/utils.py +++ b/python/cudf_polars/cudf_polars/experimental/utils.py @@ -10,7 +10,7 @@ from itertools import chain from typing import TYPE_CHECKING -from cudf_polars.dsl.expr import Col, Expr, GroupedWindow, UnaryFunction +from cudf_polars.dsl.expr import Col, GroupedWindow, UnaryFunction from cudf_polars.dsl.ir import Union from cudf_polars.dsl.traversal import traversal from cudf_polars.experimental.base import PartitionInfo @@ -49,11 +49,8 @@ def _fallback_inform( def _dynamic_planning_on(config_options: ConfigOptions[StreamingExecutor]) -> bool: - """Check if dynamic planning is enabled for rapidsmpf runtime.""" - return ( - config_options.executor.runtime == "rapidsmpf" - and config_options.executor.dynamic_planning is not None - ) + """Check if dynamic planning is enabled.""" + return config_options.executor.dynamic_planning is not None def _lower_ir_fallback( @@ -68,9 +65,6 @@ def _lower_ir_fallback( from cudf_polars.experimental.repartition import 
Repartition from cudf_polars.experimental.select import _inline_hstack_false - config_options = rec.state["config_options"] - rapidsmpf_engine = config_options.executor.runtime == "rapidsmpf" - # Make sure we avoid mixed-length columns in intermediate TableChunks. ir = _inline_hstack_false(ir) @@ -82,13 +76,10 @@ def _lower_ir_fallback( children = [] inform = False for c in lowered_children: - child = c - if multi_partitioned := partition_info[c].count > 1: + if partition_info[c].count > 1: inform = True - if multi_partitioned or rapidsmpf_engine: - # Fall-back logic - child = Repartition(child.schema, child) - partition_info[child] = PartitionInfo(count=1) + child = Repartition(c.schema, c) + partition_info[child] = PartitionInfo(count=1) children.append(child) if inform and msg: @@ -114,32 +105,6 @@ def _leaf_column_names(expr: Expr) -> tuple[str, ...]: return () -def _get_unique_fractions( - column_names: Sequence[str], - user_unique_fractions: dict[str, float], -) -> dict[str, float]: - """ - Return unique-fraction statistics subset. - - Parameters - ---------- - column_names - The column names to get unique-fractions for. - user_unique_fractions - The user-provided unique-fraction dictionary. - - Returns - ------- - unique_fractions - The final unique-fraction dictionary filtered to column_names. 
- """ - return { - c: max(min(f, 1.0), 0.00001) - for c, f in user_unique_fractions.items() - if c in column_names - } - - def _contains_over(exprs: Sequence[Expr]) -> bool: """Return True if any expression contains a window expression.""" return any(isinstance(e, GroupedWindow) for e in traversal(exprs)) diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py index 9f0953cd4df..5611f8c3e70 100644 --- a/python/cudf_polars/cudf_polars/testing/asserts.py +++ b/python/cudf_polars/cudf_polars/testing/asserts.py @@ -30,7 +30,6 @@ # Will be overriden by `conftest.py` with the value from the `--executor` # and `--cluster` command-line arguments DEFAULT_EXECUTOR = "in-memory" -DEFAULT_RUNTIME = "tasks" DEFAULT_CLUSTER = "single" @@ -200,7 +199,6 @@ def get_default_engine( executor = executor or DEFAULT_EXECUTOR if executor == "streaming": executor_options["cluster"] = DEFAULT_CLUSTER - executor_options["runtime"] = DEFAULT_RUNTIME return GPUEngine( raise_on_fail=True, @@ -290,7 +288,8 @@ def assert_collect_raises( if polars_except != (): raise AssertionError(f"CPU execution DID NOT RAISE {polars_except}") - engine = GPUEngine(raise_on_fail=True) + # TODO: https://github.com/rapidsai/cudf/issues/22346 + engine = GPUEngine(executor="in-memory", raise_on_fail=True) try: lazydf.collect(**final_cudf_collect_kwargs, engine=engine) # type: ignore[misc, call-overload] except cudf_except: diff --git a/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py b/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py index 6fe2de4d154..7cfb62c414e 100644 --- a/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py +++ b/python/cudf_polars/cudf_polars/testing/inject_gpu_engine.py @@ -30,6 +30,7 @@ def pytest_addoption(parser: pytest.Parser) -> None: choices=("in-memory", "spmd"), help="Which GPU engine variant to inject globally.", ) + # TODO: We never run with --inject-gpu-engine-blocksize in 
ci/run_cudf_polars_polars_tests.sh. Remove? group.addoption( "--inject-gpu-engine-blocksize", action="store", @@ -134,6 +135,7 @@ def pytest_report_header(config: pytest.Config) -> str: return f"injected GPU engine: {cls.__module__}.{cls.__name__}" +# TODO: This is just Mapping[str, str]? EXPECTED_FAILURES: Mapping[str, str | tuple[str, bool]] = { "tests/unit/io/test_csv.py::test_read_csv_only_loads_selected_columns": "Memory usage won't be correct due to GPU", "tests/unit/io/test_delta.py::test_scan_delta_version": "Need to expose hive partitioning", @@ -305,7 +307,6 @@ def pytest_report_header(config: pytest.Config) -> str: # Generally skip for: # 1) Tests that are too slow with --inject-gpu-engine-blocksize=small due to many small partitions for large data -# 2) Tests that fail during cudf_polars execution and segfaults later due to https://github.com/rapidsai/cudf/issues/22138 STREAMING_ENGINE_TESTS_TO_SKIP: Mapping[str, str] = { "tests/unit/operations/aggregation/test_aggregations.py::test_boolean_aggs": "float difference in std/var in the unit of least precision", "tests/benchmark/test_group_by.py::test_groupby_h2oai_q1": "Too slow with --inject-gpu-engine-blocksize=small", diff --git a/python/cudf_polars/cudf_polars/utils/config.py b/python/cudf_polars/cudf_polars/utils/config.py index a6bbd73929b..7b5fb5c940c 100644 --- a/python/cudf_polars/cudf_polars/utils/config.py +++ b/python/cudf_polars/cudf_polars/utils/config.py @@ -55,9 +55,7 @@ "InMemoryExecutor", "ParquetOptions", "RayContext", - "Runtime", "SPMDContext", - "ShuffleMethod", "StreamingExecutor", "StreamingFallbackMode", ] @@ -112,15 +110,6 @@ def get_total_device_memory() -> int | None: return None -@functools.cache -def rapidsmpf_single_available() -> bool: # pragma: no cover - """Query whether rapidsmpf is available as a single-process shuffle method.""" - try: - return importlib.util.find_spec("rapidsmpf.integrations.single") is not None - except (ImportError, ValueError): - return False - - 
class StreamingFallbackMode(enum.StrEnum): """ How the streaming executor handles operations that don't support multiple partitions. @@ -138,20 +127,6 @@ class StreamingFallbackMode(enum.StrEnum): SILENT = "silent" -class Runtime(enum.StrEnum): - """ - The runtime to use for the streaming executor. - - * ``Runtime.TASKS`` : Use the task-based runtime. - This is the default runtime. - * ``Runtime.RAPIDSMPF`` : Use the coroutine-based streaming runtime (rapidsmpf). - This runtime is experimental. - """ - - TASKS = "tasks" - RAPIDSMPF = "rapidsmpf" - - class Cluster(enum.StrEnum): """ The cluster configuration for the streaming executor. @@ -172,27 +147,6 @@ class Cluster(enum.StrEnum): DASK = "dask" -class ShuffleMethod(enum.StrEnum): - """ - The method to use for shuffling data between workers with the streaming executor. - - * ``ShuffleMethod.TASKS`` : Use the task-based shuffler. - * ``ShuffleMethod.RAPIDSMPF`` : Use the rapidsmpf shuffler. - * ``ShuffleMethod._RAPIDSMPF_SINGLE`` : Use the single-process rapidsmpf shuffler. - - With :class:`cudf_polars.utils.config.StreamingExecutor`, the default of ``None`` - resolves to ``ShuffleMethod.TASKS``. - - The user should **not** specify ``ShuffleMethod._RAPIDSMPF_SINGLE`` directly. - A setting of ``ShuffleMethod.RAPIDSMPF`` will be converted to the single-process - shuffler automatically when using single-GPU execution. - """ - - TASKS = "tasks" - RAPIDSMPF = "rapidsmpf" - _RAPIDSMPF_SINGLE = "rapidsmpf-single" - - T = TypeVar("T") @@ -254,7 +208,7 @@ class ParquetOptions: will also be skipped if ``max_footer_samples`` is 0. use_rapidsmpf_native Whether to use the native rapidsmpf node for parquet reading. - This option is only used when the rapidsmpf runtime is enabled. + This option is only used by the streaming executor. Default is False. 
""" @@ -315,49 +269,32 @@ def __post_init__(self) -> None: # noqa: D105 raise TypeError("use_rapidsmpf_native must be a bool") -def default_target_partition_size(cluster: str, runtime: str) -> int: +@functools.cache +def default_target_partition_size() -> int: """Return the default blocksize.""" if (device_size := get_total_device_memory()) is None: # pragma: no cover # System doesn't have proper "GPU memory". # Fall back to a conservative 1GB default. return 1_000_000_000 - if ( - cluster == "single" - and runtime == "tasks" - and _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 1 - ): - # We can use a larger blocksize when UVM is enabled - blocksize = int(device_size * 0.0625) - else: - # Otherwise, use a conservative default - blocksize = int(device_size * 0.025) + blocksize = int(device_size * 0.025) # Use lower and upper bounds of 1GB and 10GB return min(max(blocksize, 1_000_000_000), 10_000_000_000) -def default_broadcast_join_limit(cluster: str, runtime: str) -> int: +@functools.cache +def default_broadcast_join_limit() -> int: """Return the default broadcast join limit.""" if (device_size := get_total_device_memory()) is None: # pragma: no cover # System doesn't have proper "GPU memory". # We probably want to broadcast in most cases. return 32 - if runtime == "rapidsmpf": - # Target about 12.5% of the device memory when - # default_target_partition_size is used to set the - # target partition size (i.e. 5x the 2.5% default). - return min(5, int(max(1, (device_size * 0.125) // 1e9))) - elif _env_get_int("POLARS_GPU_ENABLE_CUDA_MANAGED_MEMORY", default=1) == 1: - # The "tasks" runtime always runs single-GPU; we can lean on UVM - # to support most broadcast joins. - return 32 - else: - # Extra-conservative default for the "tasks" runtime without UVM. - # We cannot spill outside a rapidsmpf shuffle within this runtime, - # so shuffling is usually preferred. 
- return 2 + # Target about 12.5% of the device memory when + # default_target_partition_size is used to set the + # target partition size (i.e. 5x the 2.5% default). + return min(5, int(max(1, (device_size * 0.125) // 1e9))) @dataclasses.dataclass(frozen=True) @@ -599,17 +536,14 @@ class StreamingExecutor: Parameters ---------- - runtime - The runtime to use for the streaming executor. - ``Runtime.TASKS`` by default. cluster The cluster configuration for the streaming executor. ``Cluster.SINGLE`` by default. * ``Cluster.SINGLE``: Single-GPU execution - * ``Cluster.SPMD``: Multi-GPU SPMD execution (rapidsmpf runtime) - * ``Cluster.RAY``: Multi-GPU Ray execution (rapidsmpf runtime) - * ``Cluster.DASK``: Multi-GPU Dask execution (rapidsmpf runtime) + * ``Cluster.SPMD``: Multi-GPU SPMD execution + * ``Cluster.RAY``: Multi-GPU Ray execution + * ``Cluster.DASK``: Multi-GPU Dask execution fallback_mode How to handle errors when the GPU engine fails to execute a query. @@ -621,13 +555,6 @@ class StreamingExecutor: The maximum number of rows to process per partition. 1_000_000 by default. When the number of rows exceeds this value, the query will be split into multiple partitions and executed in parallel. - unique_fraction - A dictionary mapping column names to floats between 0 and 1 (inclusive - on the right). - - Each factor estimates the fractional number of unique values in the - column. By default, ``1.0`` is used for any column not included in - ``unique_fraction``. target_partition_size Target partition size, in bytes, for IO tasks. This configuration currently controls how large parquet files are split into multiple partitions. 
@@ -639,11 +566,8 @@ class StreamingExecutor: - keyword argument to ``polars.GPUEngine`` - the ``CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE`` environment variable - By default, cudf-polars uses a target partition size that's a fraction - of the device memory, where the fraction depends on the cluster and runtime: - - - rapidsmpf runtime: 1/40th of the device memory - - single cluster and tasks runtime: 1/16th of the device memory + By default, cudf-polars uses a target partition size of 1/40th of the + device memory. The pynvml library is used to query the total device memory on the first visible GPU. If the device size is not available, the default target @@ -651,26 +575,14 @@ class StreamingExecutor: NOTE: If this configuration is changed manually, it is recommended to set `broadcast_join_limit` manually as well. - groupby_n_ary - The factor by which the number of partitions is decreased when performing - a groupby on a partitioned column. For example, if a column has 64 partitions, - it will first be reduced to ``ceil(64 / 32) = 2`` partitions. - - This is useful when the absolute number of partitions is large. broadcast_join_limit The maximum number of partitions to allow for the smaller table in a broadcast join. For example, if the target partition size is 1GB and the broadcast join limit is 5, then the smaller table will be broadcasted - if it is smaller than 5GB (within the "rapidsmpf" runtime) or contains - fewer than 5 partitions (within the "tasks" runtime). The default depends - on the cluster and runtime. - shuffle_method - The method to use for shuffling data between workers. Defaults to - 'tasks' for the single-GPU cluster. + if it is smaller than 5GB. client_device_threshold - Threshold for spilling data from device memory in rapidsmpf. + Threshold for spilling data from device memory. Default is 50% of device memory on the client process. - This argument is only used by the "rapidsmpf" runtime. 
sink_to_directory Whether multi-partition sink operations write to a directory rather than a single file. For the spmd, ray, and dask clusters this is @@ -680,7 +592,7 @@ class StreamingExecutor: Options controlling dynamic shuffle planning. See :class:`~cudf_polars.utils.config.DynamicPlanningOptions` for more. max_io_threads - Maximum number of IO threads for the rapidsmpf runtime. Default is 4. + Maximum number of IO threads. Default is 4. This controls the parallelism of IO operations when reading data. spill_to_pinned_memory Whether RapidsMPF should spill to pinned host memory when available, @@ -688,8 +600,8 @@ class StreamingExecutor: bandwidth and lower latency for device to host transfers compared to regular pageable host memory. num_py_executors - Maximum number of workers for the Python ThreadPoolExecutor used by - the rapidsmpf runtime. Default is 8. + Maximum number of workers for the Python ThreadPoolExecutor. + Default is 8. Notes ----- @@ -700,13 +612,6 @@ class StreamingExecutor: _env_prefix = "CUDF_POLARS__EXECUTOR" name: Literal["streaming"] = dataclasses.field(default="streaming", init=False) - runtime: Runtime = dataclasses.field( - default_factory=_make_default_factory( - f"{_env_prefix}__RUNTIME", - Runtime.__call__, - default=Runtime.TASKS, - ) - ) cluster: Cluster | None = dataclasses.field( default_factory=_make_default_factory( f"{_env_prefix}__CLUSTER", @@ -726,33 +631,16 @@ class StreamingExecutor: f"{_env_prefix}__MAX_ROWS_PER_PARTITION", int, default=1_000_000 ) ) - unique_fraction: dict[str, float] = dataclasses.field( - default_factory=_make_default_factory( - f"{_env_prefix}__UNIQUE_FRACTION", json.loads, default={} - ) - ) target_partition_size: int = dataclasses.field( default_factory=_make_default_factory( f"{_env_prefix}__TARGET_PARTITION_SIZE", int, default=0 ) ) - groupby_n_ary: int = dataclasses.field( - default_factory=_make_default_factory( - f"{_env_prefix}__GROUPBY_N_ARY", int, default=32 - ) - ) broadcast_join_limit: 
int = dataclasses.field( default_factory=_make_default_factory( f"{_env_prefix}__BROADCAST_JOIN_LIMIT", int, default=0 ) ) - shuffle_method: ShuffleMethod = dataclasses.field( - default_factory=_make_default_factory( - f"{_env_prefix}__SHUFFLE_METHOD", - ShuffleMethod.__call__, - default=ShuffleMethod.TASKS, - ) - ) client_device_threshold: float = dataclasses.field( default_factory=_make_default_factory( f"{_env_prefix}__CLIENT_DEVICE_THRESHOLD", float, default=0.5 @@ -786,33 +674,10 @@ class StreamingExecutor: dask_context: DaskContext | None = None def __post_init__(self) -> None: # noqa: D105 - # Check for rapidsmpf runtime - if self.runtime == "rapidsmpf": # pragma: no cover; requires rapidsmpf runtime - if not rapidsmpf_single_available(): - raise ValueError("The rapidsmpf streaming engine requires rapidsmpf.") - object.__setattr__(self, "shuffle_method", "rapidsmpf") - if self.cluster is None: object.__setattr__(self, "cluster", Cluster.SINGLE) assert self.cluster is not None, "Expected cluster to be set." - # Handle shuffle_method defaults for streaming executor - if self.shuffle_method is None: - # Use task-based shuffle by default. - # TODO: Evaluate single-process shuffle by default. - object.__setattr__(self, "shuffle_method", "tasks") - elif self.shuffle_method == "rapidsmpf-single": - # The user should NOT specify "rapidsmpf-single" directly. - raise ValueError("rapidsmpf-single is not a supported shuffle method.") - elif self.shuffle_method == "rapidsmpf": - if self.cluster == "single" and not rapidsmpf_single_available(): - raise ValueError( - "rapidsmpf shuffle method requested, but rapidsmpf is not installed." 
- ) - # Select "rapidsmpf-single" for single-GPU - if self.cluster == "single": - object.__setattr__(self, "shuffle_method", "rapidsmpf-single") - # frozen dataclass, so use object.__setattr__ object.__setattr__( self, "fallback_mode", StreamingFallbackMode(self.fallback_mode) @@ -821,16 +686,15 @@ def __post_init__(self) -> None: # noqa: D105 object.__setattr__( self, "target_partition_size", - default_target_partition_size(self.cluster, self.runtime), + default_target_partition_size(), ) if self.broadcast_join_limit == 0: object.__setattr__( self, "broadcast_join_limit", - default_broadcast_join_limit(self.cluster, self.runtime), + default_broadcast_join_limit(), ) object.__setattr__(self, "cluster", Cluster(self.cluster)) - object.__setattr__(self, "shuffle_method", ShuffleMethod(self.shuffle_method)) # Handle dynamic_planning. # Can be None, dict, or DynamicPlanningOptions @@ -853,12 +717,8 @@ def __post_init__(self) -> None: # noqa: D105 # Type / value check everything else if not isinstance(self.max_rows_per_partition, int): raise TypeError("max_rows_per_partition must be an int") - if not isinstance(self.unique_fraction, dict): - raise TypeError("unique_fraction must be a dict of column name to float") if not isinstance(self.target_partition_size, int): raise TypeError("target_partition_size must be an int") - if not isinstance(self.groupby_n_ary, int): - raise TypeError("groupby_n_ary must be an int") if not isinstance(self.broadcast_join_limit, int): raise TypeError("broadcast_join_limit must be an int") if not isinstance(self.sink_to_directory, bool): @@ -873,10 +733,9 @@ def __post_init__(self) -> None: # noqa: D105 raise TypeError("num_py_executors must be an int") def __hash__(self) -> int: # noqa: D105 - # cardinality factory, a dict, isn't natively hashable. We'll dump it + # dynamic_planning factory, a dataclass, isn't natively hashable. We'll dump it # to json and hash that. 
d = dataclasses.asdict(self) - d["unique_fraction"] = json.dumps(d["unique_fraction"]) d["dynamic_planning"] = json.dumps(d["dynamic_planning"]) return hash(tuple(sorted(d.items()))) @@ -1059,19 +918,6 @@ def from_polars_engine( executor = InMemoryExecutor(**user_executor_options) case "streaming": user_executor_options = user_executor_options.copy() - # Handle the interaction between the default shuffle method, the - # cluster, and whether rapidsmpf is available. - env_shuffle_method = os.environ.get( - "CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", None - ) - if env_shuffle_method is not None: - shuffle_method_default = ShuffleMethod(env_shuffle_method) - else: - shuffle_method_default = None - - user_executor_options.setdefault( - "shuffle_method", shuffle_method_default - ) # Handle dynamic_planning: check user config, then env var user_dynamic_planning = user_executor_options.get( @@ -1097,7 +943,7 @@ def from_polars_engine( } # Handle "cuda-stream-policy". - # The default will depend on the runtime and executor. + # The default will depend on the executor. user_cuda_stream_policy = engine.config.get( "cuda_stream_policy", None ) or os.environ.get("CUDF_POLARS__CUDA_STREAM_POLICY", None) @@ -1105,24 +951,18 @@ def from_polars_engine( cuda_stream_policy: CUDAStreamPoolConfig | None if user_cuda_stream_policy is None: - if ( - executor.name == "streaming" and executor.runtime == Runtime.RAPIDSMPF - ): # pragma: no cover; requires rapidsmpf runtime - # the rapidsmpf runtime defaults to using a stream pool + if executor.name == "streaming": cuda_stream_policy = CUDAStreamPoolConfig() else: - # everything else defaults to the default stream cuda_stream_policy = None else: cuda_stream_policy = _convert_cuda_stream_policy(user_cuda_stream_policy) - # Pool policy is only supported by the rapidsmpf runtime. 
if isinstance(cuda_stream_policy, CUDAStreamPoolConfig) and ( - (executor.name != "streaming") - or (executor.name == "streaming" and executor.runtime != Runtime.RAPIDSMPF) + executor.name != "streaming" ): raise ValueError( - "A stream pool is only supported by the rapidsmpf runtime." + "A stream pool is only supported by the streaming executor." ) kwargs["cuda_stream_policy"] = cuda_stream_policy diff --git a/python/cudf_polars/cudf_polars/utils/cuda_stream.py b/python/cudf_polars/cudf_polars/utils/cuda_stream.py index c0708d3bea8..22022ee3401 100644 --- a/python/cudf_polars/cudf_polars/utils/cuda_stream.py +++ b/python/cudf_polars/cudf_polars/utils/cuda_stream.py @@ -17,11 +17,6 @@ from rmm.pylibrmm.stream import Stream -def get_dask_cuda_stream() -> Stream: - """Get the default CUDA stream for Dask.""" - return DEFAULT_STREAM - - def get_cuda_stream() -> Stream: """Get the default CUDA stream for the current thread.""" return DEFAULT_STREAM diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index d48793f0541..47633e42364 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -26,6 +26,7 @@ dependencies = [ "packaging", "polars>=1.30,<1.39", "pylibcudf==26.6.*,>=0.0.0a0", + "rapidsmpf==26.6.*,>=0.0.0a0", "typing_extensions>=4.0.0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
classifiers = [ @@ -47,7 +48,6 @@ test = [ "pytest-cov", "pytest-httpserver", "pytest-xdist", - "rapidsmpf==26.6.*,>=0.0.0a0", "rich", "structlog", "zstandard", diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py index 7f00684638f..b3d83b36d36 100644 --- a/python/cudf_polars/tests/conftest.py +++ b/python/cudf_polars/tests/conftest.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations -import importlib.util from typing import TYPE_CHECKING import pytest @@ -54,13 +53,6 @@ def clear_memory_resource_cache(): @pytest.fixture(autouse=True) def _skip_unless_spmd(request: pytest.FixtureRequest) -> None: """Skip tests in SPMD multi-rank mode unless marked with ``pytest.mark.spmd``.""" - # Do not use `pytest.importorskip` here: this fixture is autouse, so an - # import-based skip would skip every test in the suite on environments - # without rapidsmpf (e.g. the coverage CI job), masking real coverage. - # We only want to gate the nranks>1 check on rapidsmpf being available. - if importlib.util.find_spec("rapidsmpf") is None: - return - from rapidsmpf.bootstrap import get_nranks, is_running_with_rrun if ( @@ -79,7 +71,6 @@ def streaming_engines() -> Generator[StreamingEngines, None, None]: name to a single shared engine instance, which is reused across the entire test session. """ - pytest.importorskip("rapidsmpf") from rapidsmpf import bootstrap from rapidsmpf.communicator.single import new_communicator as single_communicator from rapidsmpf.config import Options, get_environment_variables @@ -228,7 +219,8 @@ def engine_raise_on_fail() -> pl.GPUEngine: from ``.collect()``. Uses the in-memory executor so errors are not wrapped by a streaming task group. 
     """
-    return pl.GPUEngine(raise_on_fail=True)
+    # TODO: We should be testing with all supported engine variants
+    return pl.GPUEngine(executor="in-memory", raise_on_fail=True)


 def pytest_addoption(parser):
@@ -240,14 +232,6 @@ def pytest_addoption(parser):
         help="Executor to use for GPUEngine.",
     )

-    parser.addoption(
-        "--runtime",
-        action="store",
-        default="tasks",
-        choices=("tasks", "rapidsmpf"),
-        help="Runtime to use for the 'streaming' executor.",
-    )
-
     parser.addoption(
         "--cluster",
         action="store",
@@ -278,17 +262,7 @@ def pytest_configure(config):
     # apply globally rather than per-module.
     config.addinivalue_line("filterwarnings", "ignore::ResourceWarning")

-    if config.getoption("--runtime") == "rapidsmpf":
-        if config.getoption("--executor") == "in-memory":
-            raise pytest.UsageError("Rapidsmpf runtime requires --executor='streaming'")
-
-        if importlib.util.find_spec("rapidsmpf") is None:
-            raise pytest.UsageError(
-                "Rapidsmpf runtime requires the 'rapidsmpf' package"
-            )
-
     cudf_polars.testing.asserts.DEFAULT_EXECUTOR = config.getoption("--executor")
-    cudf_polars.testing.asserts.DEFAULT_RUNTIME = config.getoption("--runtime")
     cudf_polars.testing.asserts.DEFAULT_CLUSTER = config.getoption("--cluster")
diff --git a/python/cudf_polars/tests/experimental/test_dask.py b/python/cudf_polars/tests/experimental/test_dask.py
index 5ccdde864ef..93ef4318490 100644
--- a/python/cudf_polars/tests/experimental/test_dask.py
+++ b/python/cudf_polars/tests/experimental/test_dask.py
@@ -64,7 +64,6 @@ def test_yields_engine(engine: DaskEngine) -> None:
 def test_executor_options_forwarded(engine: DaskEngine) -> None:
     """Reserved executor_options keys are injected into the engine config."""
     opts = engine.config["executor_options"]
-    assert opts["runtime"] == "rapidsmpf"
     assert opts["cluster"] == "dask"
     assert isinstance(opts["dask_context"], DaskContext)

@@ -196,7 +195,6 @@ def test_reset_updates_executor_options(reset_engine: DaskEngine) -> None:
     opts =
reset_engine.config["executor_options"] assert opts["max_rows_per_partition"] == 42 # Reserved keys are still injected by ``_reset``. - assert opts["runtime"] == "rapidsmpf" assert opts["cluster"] == "dask" assert isinstance(opts["dask_context"], DaskContext) diff --git a/python/cudf_polars/tests/experimental/test_explain.py b/python/cudf_polars/tests/experimental/test_explain.py index fecd4ba4d03..7f19e318778 100644 --- a/python/cudf_polars/tests/experimental/test_explain.py +++ b/python/cudf_polars/tests/experimental/test_explain.py @@ -540,8 +540,7 @@ def test_scan_properties(tmp_path: Path, predicate: pl.Expr | None): engine = pl.GPUEngine(executor="streaming", raise_on_fail=True) dag = serialize_query(q, engine) - # walk Union -> Scan - node = dag.nodes[dag.nodes[dag.roots[0]].children[0]] + node = dag.nodes[dag.roots[0]] assert node.type == "Scan" assert node.properties == expected_properties @@ -673,7 +672,6 @@ def test_dynamic_planning_adds_repartition(df, op): executor="streaming", raise_on_fail=True, executor_options={ - "runtime": "rapidsmpf", "dynamic_planning": {}, "max_rows_per_partition": 1_000_000, }, diff --git a/python/cudf_polars/tests/experimental/test_groupby.py b/python/cudf_polars/tests/experimental/test_groupby.py index 8d6ac5927e9..03d87fe23e9 100644 --- a/python/cudf_polars/tests/experimental/test_groupby.py +++ b/python/cudf_polars/tests/experimental/test_groupby.py @@ -270,10 +270,7 @@ def test_groupby_literal_key(df, streaming_engine): @pytest.mark.parametrize("keys", [("y",), ("y", "z")]) def test_groupby_agg_config_options(df, op, keys, streaming_engine_factory): streaming_engine = streaming_engine_factory( - StreamingOptions( - max_rows_per_partition=4, - unique_fraction={"z": 0.5}, - ), + StreamingOptions(max_rows_per_partition=4), ) agg = getattr(pl.col("x"), op)() if op in ("sum", "mean"): diff --git a/python/cudf_polars/tests/experimental/test_hstack.py b/python/cudf_polars/tests/experimental/test_hstack.py index 
9bbb4b7aa33..0c21678f7e2 100644 --- a/python/cudf_polars/tests/experimental/test_hstack.py +++ b/python/cudf_polars/tests/experimental/test_hstack.py @@ -20,7 +20,6 @@ from cudf_polars.experimental.statistics import collect_statistics from cudf_polars.testing.asserts import ( DEFAULT_CLUSTER, - DEFAULT_RUNTIME, assert_gpu_result_equal, ) from cudf_polars.utils.config import ConfigOptions @@ -34,7 +33,6 @@ def engine(): executor_options={ "max_rows_per_partition": 3, "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, }, ) diff --git a/python/cudf_polars/tests/experimental/test_options.py b/python/cudf_polars/tests/experimental/test_options.py index 291cbda7589..eb64cb97ed8 100644 --- a/python/cudf_polars/tests/experimental/test_options.py +++ b/python/cudf_polars/tests/experimental/test_options.py @@ -66,11 +66,6 @@ def test_executor_options_includes_set_fields() -> None: assert "log" not in result -def test_executor_options_unique_fraction() -> None: - result = StreamingOptions(unique_fraction={"col_a": 0.5}).to_executor_options() - assert result["unique_fraction"] == {"col_a": 0.5} - - def test_executor_options_num_py_executors() -> None: result = StreamingOptions(num_py_executors=4).to_executor_options() assert result["num_py_executors"] == 4 diff --git a/python/cudf_polars/tests/experimental/test_parallel.py b/python/cudf_polars/tests/experimental/test_parallel.py index 42365a113e2..67fc372e2e4 100644 --- a/python/cudf_polars/tests/experimental/test_parallel.py +++ b/python/cudf_polars/tests/experimental/test_parallel.py @@ -12,13 +12,9 @@ from polars.testing import assert_frame_equal from cudf_polars import Translator -from cudf_polars.dsl.expressions.base import Col, NamedExpr from cudf_polars.dsl.traversal import traversal -from cudf_polars.experimental.parallel import lower_ir_graph from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions -from cudf_polars.experimental.statistics import collect_statistics from 
cudf_polars.testing.asserts import assert_gpu_result_equal -from cudf_polars.utils.config import ConfigOptions @pytest.mark.parametrize("column", ["a", "b"]) @@ -134,40 +130,3 @@ def test_pickle_conditional_join_args(): ir = Translator(q._ldf.visit(), GPUEngine()).translate_ir() for node in traversal([ir]): pickle.loads(pickle.dumps(node._non_child_args)) - - -def test_preserve_partitioning(streaming_engine_factory): - streaming_engine = streaming_engine_factory( - StreamingOptions( - max_rows_per_partition=2, - broadcast_join_limit=2, - unique_fraction={"a": 1.0}, - ), - ) - left = pl.LazyFrame({"a": [1, 2, 3, 4] * 5, "b": range(20)}) - right = pl.LazyFrame({"a": [3, 4, 5, 6, 7] * 4, "c": range(20)}) - q = ( - left.join(right, on="a") - .filter(pl.col("a") == 2) - .group_by(pl.col("a")) - .mean() - .select(pl.col("a"), pl.col("c")) - ) - _engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "max_rows_per_partition": 2, - "broadcast_join_limit": 2, - "unique_fraction": {"a": 1.0}, - }, - ) - config_options = ConfigOptions.from_polars_engine(_engine) - ir = Translator(q._ldf.visit(), _engine).translate_ir() - ir, partition_info = lower_ir_graph( - ir, config_options, collect_statistics(ir, config_options) - ) - expect_dtype = ir.schema["a"] - expect_expr = (NamedExpr("a", Col(expect_dtype, "a")),) - assert partition_info[ir].partitioned_on == expect_expr - assert_gpu_result_equal(q, engine=streaming_engine) diff --git a/python/cudf_polars/tests/experimental/test_ray.py b/python/cudf_polars/tests/experimental/test_ray.py index ded4903c594..f62c3e3b831 100644 --- a/python/cudf_polars/tests/experimental/test_ray.py +++ b/python/cudf_polars/tests/experimental/test_ray.py @@ -53,7 +53,7 @@ def engine() -> Iterator[RayEngine]: def test_reserved_executor_keys() -> None: """executor_options rejects reserved keys.""" - for key in ("runtime", "cluster", "spmd_context", "ray_context"): + for key in ("cluster", "spmd_context", 
"ray_context"): with pytest.raises(TypeError, match="reserved"): RayEngine(executor_options={key: "anything"}) @@ -109,7 +109,6 @@ def test_executor_options_forwarded( ) -> None: """Reserved executor_options keys are injected into the engine config.""" opts = engine.config["executor_options"] - assert opts["runtime"] == "rapidsmpf" assert opts["cluster"] == "ray" assert isinstance(opts["ray_context"], RayContext) assert engine.rank_actors == opts["ray_context"].rank_actors @@ -258,7 +257,6 @@ def test_reset_updates_executor_options(reset_engine: RayEngine) -> None: opts = reset_engine.config["executor_options"] assert opts["max_rows_per_partition"] == 42 # Reserved keys are still injected by ``_reset``. - assert opts["runtime"] == "rapidsmpf" assert opts["cluster"] == "ray" assert isinstance(opts["ray_context"], RayContext) assert opts["ray_context"].rank_actors == reset_engine.rank_actors diff --git a/python/cudf_polars/tests/experimental/test_sort.py b/python/cudf_polars/tests/experimental/test_sort.py index 194686acf6b..f0abf5caade 100644 --- a/python/cudf_polars/tests/experimental/test_sort.py +++ b/python/cudf_polars/tests/experimental/test_sort.py @@ -9,7 +9,6 @@ from cudf_polars.testing.asserts import ( DEFAULT_CLUSTER, - DEFAULT_RUNTIME, assert_gpu_result_equal, ) @@ -22,7 +21,6 @@ def engine(): executor_options={ "max_rows_per_partition": 3, "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, "fallback_mode": "raise", }, ) @@ -36,7 +34,6 @@ def engine_large(): executor_options={ "max_rows_per_partition": 2_100, "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, "fallback_mode": "raise", }, ) @@ -139,7 +136,6 @@ def test_sort_after_sparse_join(): executor="streaming", executor_options={ "cluster": DEFAULT_CLUSTER, - "runtime": DEFAULT_RUNTIME, "max_rows_per_partition": 4, }, ) diff --git a/python/cudf_polars/tests/experimental/test_spmd.py b/python/cudf_polars/tests/experimental/test_spmd.py index 9fef0e00350..96ec5eab932 100644 --- 
a/python/cudf_polars/tests/experimental/test_spmd.py +++ b/python/cudf_polars/tests/experimental/test_spmd.py @@ -66,7 +66,7 @@ def test_single_communicator_outside_rrun() -> None: def test_reserved_keys() -> None: """executor_options rejects reserved keys.""" - for key in ("runtime", "cluster", "spmd_context"): + for key in ("cluster", "spmd_context"): with ( pytest.raises(TypeError, match="reserved"), SPMDEngine(executor_options={key: "anything"}), @@ -320,7 +320,6 @@ def test_reset_updates_executor_options(comm: Communicator) -> None: opts = engine.config["executor_options"] assert opts["max_rows_per_partition"] == 42 # Reserved keys are still injected by ``_reset``. - assert opts["runtime"] == "rapidsmpf" assert opts["cluster"] == "spmd" assert isinstance(opts["spmd_context"], SPMDContext) diff --git a/python/cudf_polars/tests/experimental/test_unique.py b/python/cudf_polars/tests/experimental/test_unique.py index 49d2b580300..6bb30624cb6 100644 --- a/python/cudf_polars/tests/experimental/test_unique.py +++ b/python/cudf_polars/tests/experimental/test_unique.py @@ -34,12 +34,9 @@ def df(): @pytest.mark.parametrize("subset", [None, ("y",), ("y", "z")]) @pytest.mark.parametrize("keep", ["first", "last", "any", "none"]) @pytest.mark.parametrize("maintain_order", [True, False]) -@pytest.mark.parametrize("cardinality", [{}, {"y": 0.7}]) -def test_unique( - df, streaming_engine_factory, keep, subset, maintain_order, cardinality -): +def test_unique(df, streaming_engine_factory, keep, subset, maintain_order): engine = streaming_engine_factory( - StreamingOptions(unique_fraction=cardinality, fallback_mode="warn"), + StreamingOptions(fallback_mode="warn"), ) q = df.unique(subset=subset, keep=keep, maintain_order=maintain_order) check_row_order = maintain_order @@ -50,40 +47,16 @@ def test_unique( assert_gpu_result_equal(q, engine=engine, check_row_order=check_row_order) -def test_unique_fallback(df, streaming_engine_factory): - engine = streaming_engine_factory( - 
StreamingOptions( - unique_fraction={"y": 1.0}, - fallback_mode="raise", - dynamic_planning=None, - ), - ) - q = df.unique(keep="first", maintain_order=True) - with pytest.raises( - NotImplementedError, - match="Unsupported unique options", - ): - assert_gpu_result_equal(q, engine=engine) - - @pytest.mark.parametrize("maintain_order", [True, False]) -@pytest.mark.parametrize("cardinality", [{}, {"y": 0.5}]) -def test_unique_select(df, streaming_engine_factory, maintain_order, cardinality): +def test_unique_select(df, streaming_engine_factory, maintain_order): engine = streaming_engine_factory( StreamingOptions( max_rows_per_partition=4, - unique_fraction=cardinality, fallback_mode="warn", ), ) q = df.select(pl.col("y").unique(maintain_order=maintain_order)) - if cardinality == {"y": 0.5} and maintain_order: - with pytest.warns( - UserWarning, match="Unsupported unique options for multiple partitions." - ): - assert_gpu_result_equal(q, engine=engine, check_row_order=False) - else: - assert_gpu_result_equal(q, engine=engine, check_row_order=False) + assert_gpu_result_equal(q, engine=engine, check_row_order=False) @pytest.mark.parametrize("keep", ["first", "last", "any"]) diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py index 3cd66bc527d..6004c5eef40 100644 --- a/python/cudf_polars/tests/test_config.py +++ b/python/cudf_polars/tests/test_config.py @@ -4,7 +4,7 @@ from __future__ import annotations import sys -from typing import Any, cast +from typing import cast import pytest @@ -35,20 +35,7 @@ StreamingExecutor, _default_cuda_stream_policy, ) -from cudf_polars.utils.cuda_stream import ( - get_cuda_stream, - get_dask_cuda_stream, -) - - -@pytest.fixture(params=[False, True], ids=["norapidsmpf.single", "rapidsmpf.single"]) -def rapidsmpf_single_available(request, monkeypatch): - monkeypatch.setattr( - cudf_polars.utils.config, - "rapidsmpf_single_available", - lambda: request.param, - ) - return request.param +from 
cudf_polars.utils.cuda_stream import get_cuda_stream def test_polars_verbose_warns(monkeypatch): @@ -232,47 +219,6 @@ def test_parquet_options_from_none() -> None: assert config.parquet_options.chunked is True -def test_validate_streaming_executor_shuffle_method( - *, rapidsmpf_single_available: bool -) -> None: - config = ConfigOptions.from_polars_engine( - pl.GPUEngine( - executor="streaming", - executor_options={"shuffle_method": "tasks"}, - ) - ) - assert config.executor.name == "streaming" - assert config.executor.shuffle_method == "tasks" - - # rapidsmpf with single cluster - engine = pl.GPUEngine( - executor="streaming", - executor_options={"shuffle_method": "rapidsmpf", "cluster": "single"}, - ) - - if rapidsmpf_single_available: - config = ConfigOptions.from_polars_engine(engine) - assert config.executor.name == "streaming" - assert config.executor.shuffle_method == "rapidsmpf-single" - else: - with pytest.raises(ValueError, match="rapidsmpf is not installed"): - ConfigOptions.from_polars_engine(engine) - - -def test_join_rapidsmpf_single_private_config() -> None: - # The user may not specify "rapidsmpf-single" directly - engine = pl.GPUEngine( - raise_on_fail=True, - executor="streaming", - executor_options={ - "shuffle_method": "rapidsmpf-single", - "runtime": "tasks", - }, - ) - with pytest.raises(ValueError, match="not a supported shuffle method"): - ConfigOptions.from_polars_engine(engine) - - @pytest.mark.parametrize("executor", ["in-memory", "streaming"]) def test_hashable(executor: str) -> None: config = ConfigOptions.from_polars_engine( @@ -319,31 +265,11 @@ def test_validate_cluster() -> None: ) -def test_validate_shuffle_method_defaults() -> None: - config = ConfigOptions.from_polars_engine( - pl.GPUEngine( - executor="streaming", - ) - ) - assert config.executor.name == "streaming" - assert config.executor.shuffle_method == "tasks" # Default for single cluster - - with pytest.raises(ValueError, match="'foo' is not a valid ShuffleMethod"): - 
ConfigOptions.from_polars_engine( - pl.GPUEngine( - executor="streaming", - executor_options={"shuffle_method": "foo"}, - ) - ) - - @pytest.mark.parametrize( "option", [ "max_rows_per_partition", - "unique_fraction", "target_partition_size", - "groupby_n_ary", "broadcast_join_limit", "sink_to_directory", "client_device_threshold", @@ -409,11 +335,8 @@ def test_config_option_from_env(monkeypatch: pytest.MonkeyPatch) -> None: m.setenv("CUDF_POLARS__EXECUTOR__CLUSTER", "single") m.setenv("CUDF_POLARS__EXECUTOR__FALLBACK_MODE", "silent") m.setenv("CUDF_POLARS__EXECUTOR__MAX_ROWS_PER_PARTITION", "42") - m.setenv("CUDF_POLARS__EXECUTOR__UNIQUE_FRACTION", '{"a": 0.5}') m.setenv("CUDF_POLARS__EXECUTOR__TARGET_PARTITION_SIZE", "100") - m.setenv("CUDF_POLARS__EXECUTOR__GROUPBY_N_ARY", "43") m.setenv("CUDF_POLARS__EXECUTOR__BROADCAST_JOIN_LIMIT", "44") - m.setenv("CUDF_POLARS__EXECUTOR__SHUFFLE_METHOD", "tasks") m.setenv("CUDF_POLARS__CUDA_STREAM_POLICY", "default") engine = pl.GPUEngine() @@ -422,11 +345,8 @@ def test_config_option_from_env(monkeypatch: pytest.MonkeyPatch) -> None: assert config.executor.cluster == "single" assert config.executor.fallback_mode == "silent" assert config.executor.max_rows_per_partition == 42 - assert config.executor.unique_fraction == {"a": 0.5} assert config.executor.target_partition_size == 100 - assert config.executor.groupby_n_ary == 43 assert config.executor.broadcast_join_limit == 44 - assert config.executor.shuffle_method == "tasks" assert config.cuda_stream_policy is None @@ -498,12 +418,6 @@ def test_default_executor() -> None: assert config.executor.name == "streaming" -def test_default_runtime() -> None: - config = ConfigOptions.from_polars_engine(pl.GPUEngine()) - assert config.executor.name == "streaming" - assert config.executor.runtime == "tasks" - - @pytest.mark.parametrize( "memory_resource, memory_resource_config", [ @@ -537,10 +451,7 @@ def test_memory_resource(memory_resource, memory_resource_config) -> None: if 
memory_resource is None and memory_resource_config is None: # The default case: We make a new RMM MR, whose type depends on the GPU's features. - if _is_concurrent_managed_access_supported(): - assert isinstance(result, rmm.mr.PrefetchResourceAdaptor) - else: - assert isinstance(result, rmm.mr.CudaAsyncMemoryResource) + assert isinstance(result, rmm.mr.CudaAsyncMemoryResource) elif memory_resource is None: # Configured through memory_resource_config @@ -608,21 +519,17 @@ def test_cuda_stream_pool(): def test_cuda_stream_policy_default(monkeypatch: pytest.MonkeyPatch) -> None: # Default from engine config = ConfigOptions.from_polars_engine(pl.GPUEngine()) - assert config.cuda_stream_policy is None + assert isinstance(config.cuda_stream_policy, CUDAStreamPoolConfig) - config = ConfigOptions.from_polars_engine( - pl.GPUEngine(executor_options={"runtime": "tasks"}) - ) - assert config.cuda_stream_policy is None + config = ConfigOptions.from_polars_engine(pl.GPUEngine(executor="streaming")) + assert isinstance(config.cuda_stream_policy, CUDAStreamPoolConfig) # Default from env monkeypatch.setenv("CUDF_POLARS__CUDA_STREAM_POLICY", "default") config = ConfigOptions.from_polars_engine(pl.GPUEngine()) assert config.cuda_stream_policy is None - config = ConfigOptions.from_polars_engine( - pl.GPUEngine(executor_options={"runtime": "tasks"}) - ) + config = ConfigOptions.from_polars_engine(pl.GPUEngine(executor="streaming")) assert config.cuda_stream_policy is None @@ -635,26 +542,19 @@ def test_default_cuda_stream_policy(monkeypatch: pytest.MonkeyPatch) -> None: assert isinstance(result, CUDAStreamPoolConfig) -def test_cuda_stream_policy_from_config(*, rapidsmpf_single_available: bool) -> None: +def test_cuda_stream_policy_from_config() -> None: engine = pl.GPUEngine( executor="streaming", - executor_options={"runtime": "rapidsmpf"}, cuda_stream_policy={ "pool_size": 32, "flags": rmm.pylibrmm.CudaStreamFlags.NON_BLOCKING, }, ) - if rapidsmpf_single_available: - config = 
ConfigOptions.from_polars_engine(engine) - assert isinstance(config.cuda_stream_policy, CUDAStreamPoolConfig) - assert config.cuda_stream_policy.pool_size == 32 - assert ( - config.cuda_stream_policy.flags == rmm.pylibrmm.CudaStreamFlags.NON_BLOCKING - ) - config.cuda_stream_policy.build().get_stream() # no exception - else: - with pytest.raises(ValueError, match="The rapidsmpf streaming engine"): - ConfigOptions.from_polars_engine(engine) + config = ConfigOptions.from_polars_engine(engine) + assert isinstance(config.cuda_stream_policy, CUDAStreamPoolConfig) + assert config.cuda_stream_policy.pool_size == 32 + assert config.cuda_stream_policy.flags == rmm.pylibrmm.CudaStreamFlags.NON_BLOCKING + config.cuda_stream_policy.build().get_stream() # no exception @pytest.mark.parametrize( @@ -667,26 +567,19 @@ def test_cuda_stream_policy_from_config(*, rapidsmpf_single_available: bool) -> '{"pool_size": 32}', ], ) -def test_cuda_stream_policy_from_env( - monkeypatch: pytest.MonkeyPatch, env: str, *, rapidsmpf_single_available: bool -) -> None: +def test_cuda_stream_policy_from_env(monkeypatch: pytest.MonkeyPatch, env: str) -> None: monkeypatch.setenv("CUDF_POLARS__CUDA_STREAM_POLICY", env) - runtime = "tasks" if env == "default" else "rapidsmpf" - engine = pl.GPUEngine(executor="streaming", executor_options={"runtime": runtime}) - if runtime == "rapidsmpf" and rapidsmpf_single_available: - config = ConfigOptions.from_polars_engine(engine) + engine = pl.GPUEngine(executor="streaming") + config = ConfigOptions.from_polars_engine(engine) + if env == "default": + assert config.cuda_stream_policy is None + else: assert isinstance(config.cuda_stream_policy, CUDAStreamPoolConfig) if env == "pool": assert config.cuda_stream_policy.pool_size == 16 assert config.cuda_stream_policy.flags == CudaStreamFlags.NON_BLOCKING else: assert config.cuda_stream_policy.pool_size == 32 - elif runtime == "rapidsmpf": - with pytest.raises(ValueError, match="The rapidsmpf streaming engine"): - 
ConfigOptions.from_polars_engine(engine) - else: - config = ConfigOptions.from_polars_engine(engine) - assert config.cuda_stream_policy is None def test_cuda_stream_policy_from_env_invalid(monkeypatch: pytest.MonkeyPatch): @@ -696,41 +589,26 @@ def test_cuda_stream_policy_from_env_invalid(monkeypatch: pytest.MonkeyPatch): def test_cuda_stream_policy_default_rapidsmpf(monkeypatch: pytest.MonkeyPatch) -> None: - pytest.importorskip("rapidsmpf") - # Default from engine - config = ConfigOptions.from_polars_engine( - pl.GPUEngine(executor_options={"runtime": "rapidsmpf"}) - ) + config = ConfigOptions.from_polars_engine(pl.GPUEngine(executor="streaming")) assert isinstance(config.cuda_stream_policy, CUDAStreamPoolConfig) assert config.cuda_stream_policy.pool_size == 16 assert config.cuda_stream_policy.flags == rmm.pylibrmm.CudaStreamFlags.NON_BLOCKING # "default" user argument overrides pool default monkeypatch.setenv("CUDF_POLARS__CUDA_STREAM_POLICY", "default") - config = ConfigOptions.from_polars_engine( - pl.GPUEngine(executor_options={"runtime": "rapidsmpf"}) - ) + config = ConfigOptions.from_polars_engine(pl.GPUEngine(executor="streaming")) assert config.cuda_stream_policy is None -@pytest.mark.parametrize( - "polars_kwargs", - [ - {"executor": "in-memory"}, - {"executor": "streaming", "executor_options": {"runtime": "tasks"}}, - ], -) -def test_cuda_stream_policy_pool_only_supported_by_rapidsmpf( - polars_kwargs: dict[str, Any], -) -> None: +def test_cuda_stream_policy_pool_in_memory_unsupported() -> None: with pytest.raises( ValueError, - match="A stream pool is only supported by the rapidsmpf runtime.", + match="A stream pool is only supported by the streaming executor.", ): ConfigOptions.from_polars_engine( pl.GPUEngine( - **polars_kwargs, + executor="in-memory", cuda_stream_policy={"pool_size": 32, "flags": "NON_BLOCKING"}, ) ) @@ -903,8 +781,3 @@ def test_dask_sink_to_directory_false_raises() -> None: ValueError, match="The dask cluster requires 
sink_to_directory=True" ): StreamingExecutor(cluster=Cluster.DASK, sink_to_directory=False) - - -def test_get_dask_cuda_stream() -> None: - stream = get_dask_cuda_stream() - assert stream is not None diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py index e3e788f2866..a655efbe422 100644 --- a/python/cudf_polars/tests/test_scan.py +++ b/python/cudf_polars/tests/test_scan.py @@ -718,7 +718,7 @@ def test_scan_parquet_zero_width_with_limit( ): request.applymarker( pytest.mark.xfail( - is_streaming_engine(engine) and custom_engine is None, + is_streaming_engine(engine) or custom_engine is not None, reason="https://github.com/rapidsai/cudf/issues/21644", ) ) diff --git a/python/cudf_polars/tests/test_sink.py b/python/cudf_polars/tests/test_sink.py index 7b69f6904b4..d23559d2134 100644 --- a/python/cudf_polars/tests/test_sink.py +++ b/python/cudf_polars/tests/test_sink.py @@ -157,6 +157,7 @@ def test_chunked_sink_empty_table_to_parquet(tmp_path): pl.LazyFrame(), tmp_path / "out.parquet", engine=pl.GPUEngine( + executor="in-memory", raise_on_fail=True, parquet_options={"chunked": True, "n_output_chunks": 2}, ), diff --git a/python/cudf_polars/tests/test_tracing.py b/python/cudf_polars/tests/test_tracing.py index 184c0a77d38..283ca361682 100644 --- a/python/cudf_polars/tests/test_tracing.py +++ b/python/cudf_polars/tests/test_tracing.py @@ -55,9 +55,10 @@ def test_trace_basic( assert b"frames_input" in result assert b"total_bytes_output" in result assert b"total_bytes_input" in result - assert b"rmm_total_bytes_output" in result - assert b"rmm_total_bytes_input" in result - assert b"rmm_current_bytes_output" in result + # TODO: With rapidsmpf are the rmm fields not supposed to be logged? 
+ assert b"rmm_total_bytes_output" not in result + assert b"rmm_total_bytes_input" not in result + assert b"rmm_current_bytes_output" not in result assert b"overhead_duration" in result @@ -79,10 +80,6 @@ def test_import_without_structlog() -> None: subprocess.check_call([sys.executable, "-c", code]) -@pytest.mark.skipif( - cudf_polars.testing.asserts.DEFAULT_RUNTIME != "rapidsmpf", - reason="Requires 'rapidsmpf' runtime.", -) def test_log_query_plan() -> None: """Test that log_query_plan emits a Query Plan event.""" import os @@ -98,7 +95,6 @@ def test_log_query_plan() -> None: executor="streaming", executor_options={ "cluster": "single", - "runtime": "rapidsmpf", "max_rows_per_partition": 5, }, memory_resource=rmm.mr.ManagedMemoryResource(), @@ -126,7 +122,6 @@ def test_log_query_plan() -> None: reason="Requires CUDF_POLARS_LOG_TRACES=1.", ) def test_sets_cudf_polars_query_id(): - pytest.importorskip("rapidsmpf") left = pl.LazyFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) right = pl.LazyFrame({"a": [1, 2, 3], "c": [7, 8, 9]}) @@ -136,7 +131,6 @@ def test_sets_cudf_polars_query_id(): engine = pl.GPUEngine( executor="streaming", raise_on_fail=True, - executor_options={"runtime": "rapidsmpf"}, ) with structlog.testing.capture_logs( diff --git a/python/cudf_polars/tests/testing/test_engine_utils.py b/python/cudf_polars/tests/testing/test_engine_utils.py index faf113502d6..346a11acf2e 100644 --- a/python/cudf_polars/tests/testing/test_engine_utils.py +++ b/python/cudf_polars/tests/testing/test_engine_utils.py @@ -3,8 +3,6 @@ from __future__ import annotations -import pytest - from cudf_polars.testing.engine_utils import ( EngineFixtureParam, create_streaming_options, @@ -30,7 +28,6 @@ def test_engine_fixture_param_small_blocksize(): def test_create_streaming_options_medium(): - pytest.importorskip("rapidsmpf") opts = create_streaming_options("medium") assert opts.max_rows_per_partition == 50 assert opts.target_partition_size == 1_000_000 @@ -38,7 +35,6 @@ def 
test_create_streaming_options_medium(): def test_create_streaming_options_small(): - pytest.importorskip("rapidsmpf") opts = create_streaming_options("small") assert opts.max_rows_per_partition == 4 assert opts.target_partition_size == 10 @@ -46,7 +42,6 @@ def test_create_streaming_options_small(): def test_create_streaming_options_overrides_merge(): """Overrides take precedence over the blocksize baseline.""" - pytest.importorskip("rapidsmpf") from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions overrides = StreamingOptions(max_rows_per_partition=999) From f49d5e8d4bd06480db547dfa70b367095bf99b92 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 6 May 2026 20:00:13 -0700 Subject: [PATCH 32/36] Use thread pool to submit hybrid scan host IO tasks (#21992) This PR uses the host worker pool to submit hybrid scan's host-read IO tasks so that the mutex can be safely released after submission. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Tianyu Liu (https://github.com/kingcrimsontianyu) - Shruti Shivakumar (https://github.com/shrshi) URL: https://github.com/rapidsai/cudf/pull/21992 --- cpp/src/io/parquet/io_utils/parquet_io_utils.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/cpp/src/io/parquet/io_utils/parquet_io_utils.cpp b/cpp/src/io/parquet/io_utils/parquet_io_utils.cpp index 3e67b49d03e..9b6953b4bd1 100644 --- a/cpp/src/io/parquet/io_utils/parquet_io_utils.cpp +++ b/cpp/src/io/parquet/io_utils/parquet_io_utils.cpp @@ -6,6 +6,7 @@ #include "io/comp/common.hpp" #include "io/parquet/parquet_common.hpp" +#include #include #include #include @@ -16,8 +17,8 @@ #include #include +#include #include -#include #include @@ -118,8 +119,7 @@ fetch_byte_ranges_to_device_async( stream.synchronize(); { - auto iter = - thrust::make_zip_iterator(io_offsets.begin(), io_sizes.begin(), destinations.begin()); + auto iter = 
cuda::make_zip_iterator(io_offsets.begin(), io_sizes.begin(), destinations.begin()); std::lock_guard lock(mutex); @@ -128,16 +128,14 @@ fetch_byte_ranges_to_device_async( auto const io_size = cuda::std::get<1>(tuple); auto const dest = cuda::std::get<2>(tuple); - // Directly read the column chunk data to the device - // buffer if supported + // Directly read the column chunk data to the device buffer if supported if (datasource.supports_device_read() and datasource.is_device_read_preferred(io_size)) { device_read_tasks.emplace_back( datasource.device_read_async(io_offset, io_size, dest, stream)); } else { - // Read the column chunk data to the host buffer and - // copy it to the device buffer - host_read_tasks.emplace_back( - std::async(std::launch::deferred, [&datasource, io_offset, io_size, dest, stream]() { + // Read the column chunk data to the host buffer copy it to the device buffer + host_read_tasks.emplace_back(cudf::detail::host_worker_pool().submit_task( + [&datasource, io_offset, io_size, dest, stream]() { auto host_buffer = datasource.host_read(io_offset, io_size); cudf::detail::cuda_memcpy_async( cudf::device_span{dest, io_size}, From 50cee5ba65909e7913f88cfa4cc95dfb3755bc3c Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Wed, 6 May 2026 23:03:11 -0700 Subject: [PATCH 33/36] Python bindings and pytests for `cudf::apply_deletion_mask` (#22145) Follow up #22144 Adds Python bindings for the `cudf::apply_deletion_mask` API and adds pytests for stream compaction. 
Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) - Matthew Murray (https://github.com/Matt711) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Bradley Dice (https://github.com/bdice) - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/22145 --- .../libcudf/lists/stream_compaction.pxd | 7 ++ .../pylibcudf/libcudf/stream_compaction.pxd | 7 ++ python/pylibcudf/pylibcudf/lists.pxd | 7 ++ python/pylibcudf/pylibcudf/lists.pyi | 6 ++ python/pylibcudf/pylibcudf/lists.pyx | 42 +++++++++++ .../pylibcudf/pylibcudf/stream_compaction.pxd | 7 ++ .../pylibcudf/pylibcudf/stream_compaction.pyi | 6 ++ .../pylibcudf/pylibcudf/stream_compaction.pyx | 36 ++++++++++ .../pylibcudf/tests/test_stream_compaction.py | 69 +++++++++++++++++++ 9 files changed, 187 insertions(+) create mode 100644 python/pylibcudf/tests/test_stream_compaction.py diff --git a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd index 0187642e0c7..7514f9d159a 100644 --- a/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd @@ -19,6 +19,13 @@ cdef extern from "cudf/lists/stream_compaction.hpp" \ device_async_resource_ref mr ) except +libcudf_exception_handler + cdef unique_ptr[column] apply_deletion_mask( + const lists_column_view& lists_column, + const lists_column_view& deletion_mask, + cudaStream_t stream, + device_async_resource_ref mr + ) except +libcudf_exception_handler + cdef unique_ptr[column] distinct( const lists_column_view& lists_column, null_equality nulls_equal, diff --git a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd index 9f8686da472..9b5f6d287f3 100644 --- a/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd @@ -48,6 +48,13 @@ cdef extern from 
"cudf/stream_compaction.hpp" namespace "cudf" nogil: device_async_resource_ref mr ) except +libcudf_exception_handler + cdef unique_ptr[table] apply_deletion_mask( + table_view source_table, + column_view deletion_mask, + cudaStream_t stream, + device_async_resource_ref mr + ) except +libcudf_exception_handler + cdef unique_ptr[table] unique( table_view input, vector[size_type] keys, diff --git a/python/pylibcudf/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd index 88b09c01531..75db812de14 100644 --- a/python/pylibcudf/pylibcudf/lists.pxd +++ b/python/pylibcudf/pylibcudf/lists.pxd @@ -150,6 +150,13 @@ cpdef Column apply_boolean_mask( DeviceMemoryResource mr=*, ) +cpdef Column apply_deletion_mask( + Column, + Column, + object stream=*, + DeviceMemoryResource mr=*, +) + cpdef Column distinct( Column, null_equality, diff --git a/python/pylibcudf/pylibcudf/lists.pyi b/python/pylibcudf/pylibcudf/lists.pyi index 1e418b59726..6ff27345854 100644 --- a/python/pylibcudf/pylibcudf/lists.pyi +++ b/python/pylibcudf/pylibcudf/lists.pyi @@ -131,6 +131,12 @@ def apply_boolean_mask( stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Column: ... +def apply_deletion_mask( + input: Column, + deletion_mask: Column, + stream: CudaStreamLike | None = None, + mr: DeviceMemoryResource | None = None, +) -> Column: ... 
def distinct( input: Column, nulls_equal: NullEquality, diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index fd05242e44f..fbc07eebb8a 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -32,6 +32,7 @@ from pylibcudf.libcudf.lists.sorting cimport ( ) from pylibcudf.libcudf.lists.stream_compaction cimport ( apply_boolean_mask as cpp_apply_boolean_mask, + apply_deletion_mask as cpp_apply_deletion_mask, distinct as cpp_distinct, ) from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option @@ -61,6 +62,7 @@ __all__ = [ "ConcatenateNullPolicy", "DuplicateFindOption", "apply_boolean_mask", + "apply_deletion_mask", "concatenate_list_elements", "concatenate_rows", "contains", @@ -833,6 +835,46 @@ cpdef Column apply_boolean_mask( return Column.from_libcudf(move(c_result), _stream, mr) +cpdef Column apply_deletion_mask( + Column input, + Column deletion_mask, + object stream=None, + DeviceMemoryResource mr=None, +): + """Filters elements in each row of the input lists column using a deletion mask. + + For details, see :cpp:func:`apply_deletion_mask`. + + Parameters + ---------- + input : Column + The input lists column. + deletion_mask : Column + A lists-of-bools column used as a deletion mask. + + Returns + ------- + Column + Lists column with elements removed where deletion_mask is true. 
+ """ + cdef unique_ptr[column] c_result + cdef ListsColumnView list_view = input.list_view() + cdef ListsColumnView mask_view = deletion_mask.list_view() + + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() + mr = _get_memory_resource(mr) + + with nogil: + c_result = cpp_apply_deletion_mask( + list_view.view(), + mask_view.view(), + _cs, + mr.get_mr(), + ) + return Column.from_libcudf(move(c_result), _stream, mr) + + cpdef Column distinct( Column input, null_equality nulls_equal, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd index 6e904e11ce1..ffe36cebfbd 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pxd +++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd @@ -37,6 +37,13 @@ cpdef Table apply_boolean_mask( DeviceMemoryResource mr = *, ) +cpdef Table apply_deletion_mask( + Table source_table, + Column deletion_mask, + object stream = *, + DeviceMemoryResource mr = *, +) + cpdef Table unique( Table input, list keys, diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyi b/python/pylibcudf/pylibcudf/stream_compaction.pyi index afdd692dde2..76e669f8995 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyi +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyi @@ -37,6 +37,12 @@ def apply_boolean_mask( stream: CudaStreamLike | None = None, mr: DeviceMemoryResource | None = None, ) -> Table: ... +def apply_deletion_mask( + source_table: Table, + deletion_mask: Column, + stream: CudaStreamLike | None = None, + mr: DeviceMemoryResource | None = None, +) -> Table: ... 
def unique( input: Table, keys: list[int], diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index b4751078acb..2fe8705ea52 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -29,6 +29,7 @@ from cuda.bindings.cyruntime cimport cudaStream_t __all__ = [ "DuplicateKeepOption", "apply_boolean_mask", + "apply_deletion_mask", "distinct", "distinct_indices", "drop_nans", @@ -151,6 +152,41 @@ cpdef Table apply_boolean_mask( return Table.from_libcudf(move(c_result), _stream, mr) +cpdef Table apply_deletion_mask( + Table source_table, + Column deletion_mask, + object stream=None, + DeviceMemoryResource mr=None, +): + """Filters out rows from the input table using a deletion mask. + + For details, see :cpp:func:`apply_deletion_mask`. + + Parameters + ---------- + source_table : Table + The input table to filter. + deletion_mask : Column + A boolean column used as a deletion mask. + + Returns + ------- + Table + Table with rows removed where deletion_mask is true. + """ + cdef unique_ptr[table] c_result + + cdef Stream _stream = _get_stream(stream) + cdef cudaStream_t _cs = _stream.view().value() + mr = _get_memory_resource(mr) + + with nogil: + c_result = cpp_stream_compaction.apply_deletion_mask( + source_table.view(), deletion_mask.view(), _cs, mr.get_mr() + ) + return Table.from_libcudf(move(c_result), _stream, mr) + + cpdef Table unique( Table input, list keys, diff --git a/python/pylibcudf/tests/test_stream_compaction.py b/python/pylibcudf/tests/test_stream_compaction.py new file mode 100644 index 00000000000..ccf21c2a6b3 --- /dev/null +++ b/python/pylibcudf/tests/test_stream_compaction.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +import pyarrow as pa +import pytest +from utils import assert_column_eq, assert_table_eq + +import pylibcudf as plc + + +@pytest.fixture +def lists_column_and_mask(): + pa_input = pa.array( + [[0, 1, 2, 3], [4, 5], [6, 7, 8, 9]], type=pa.list_(pa.int32()) + ) + pa_mask = pa.array( + [ + [True, False, True, False], + [True, False], + [True, False, True, False], + ], + type=pa.list_(pa.bool_()), + ) + return pa_input, pa_mask + + +def test_lists_apply_boolean_mask(lists_column_and_mask): + pa_input, pa_mask = lists_column_and_mask + result = plc.lists.apply_boolean_mask( + plc.Column.from_arrow(pa_input), plc.Column.from_arrow(pa_mask) + ) + expected = pa.array([[0, 2], [4], [6, 8]], type=pa.list_(pa.int32())) + assert_column_eq(expected, result) + + +def test_lists_apply_deletion_mask(lists_column_and_mask): + pa_input, pa_mask = lists_column_and_mask + result = plc.lists.apply_deletion_mask( + plc.Column.from_arrow(pa_input), plc.Column.from_arrow(pa_mask) + ) + expected = pa.array([[1, 3], [5], [7, 9]], type=pa.list_(pa.int32())) + assert_column_eq(expected, result) + + +def test_apply_boolean_mask(): + pa_table = pa.table( + { + "a": pa.array([10, 40, 70, 5, 2, 10], type=pa.int32()), + "b": pa.array([10, 40, 70, 5, 2, 10], type=pa.float64()), + } + ) + pa_mask = pa.array( + [True, False, True, False, True, False], type=pa.bool_() + ) + result = plc.stream_compaction.apply_boolean_mask( + plc.Table.from_arrow(pa_table), plc.Column.from_arrow(pa_mask) + ) + expected = pa_table.filter(pa_mask) + assert_table_eq(expected, result) + + +def test_apply_deletion_mask(): + pa_table = pa.table({"a": pa.array([1, 2, 3, 4, 5], type=pa.int32())}) + pa_mask = pa.array([True, False, True, False, True], type=pa.bool_()) + result = plc.stream_compaction.apply_deletion_mask( + plc.Table.from_arrow(pa_table), plc.Column.from_arrow(pa_mask) + ) + expected = pa.table({"a": pa.array([2, 4], type=pa.int32())}) + assert_table_eq(expected, 
result) From 47b699df1f49a19de48455d0986bad67c3e46c73 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Thu, 7 May 2026 08:52:41 -0500 Subject: [PATCH 34/36] Refactor ``sort_actor`` to prepare for ``OrderScheme`` changes (#22350) - Follow up to https://github.com/rapidsai/cudf/pull/22315 - Further revises `sort_actor` in preparation for https://github.com/rapidsai/rapidsmpf/pull/853 - Part of https://github.com/rapidsai/cudf/issues/22128 - Breaks apart `sort_actor` logic into modular steps, so we can avoid collecting boundaries when we already know the boundaries (future work). Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Matthew Murray (https://github.com/Matt711) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/22350 --- .../rapidsmpf/collectives/sort.py | 129 +++++++++++++----- 1 file changed, 94 insertions(+), 35 deletions(-) diff --git a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/collectives/sort.py b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/collectives/sort.py index ffc10ea44c2..a950df3ce34 100644 --- a/python/cudf_polars/cudf_polars/experimental/rapidsmpf/collectives/sort.py +++ b/python/cudf_polars/cudf_polars/experimental/rapidsmpf/collectives/sort.py @@ -337,16 +337,25 @@ async def _receive_and_buffer_chunks( return local_candidates_list +async def _forward_from_chunk_store( + context: Context, ch_out: Channel[TableChunk], chunk_store: ChunkStore +) -> None: + """Forward buffered messages from a ChunkStore into a channel.""" + for msg in chunk_store: + await ch_out.send(context, msg) + await ch_out.drain(context) + + async def _insert_chunks_into_shuffle( context: Context, comm: Communicator, + ir: Sort, + ir_context: IRExecutionContext, + ch_in: Channel[TableChunk], num_partitions: int, collective_ids: list[int], metadata_in: ChannelMetadata, - chunk_store: ChunkStore, sort_boundaries_df: DataFrame, - ir: Sort, - ir_context: IRExecutionContext, 
by: list[str], ) -> tuple[ShuffleManager, Sort]: """Create shuffle manager and insert each buffered chunk with sort-based splits.""" @@ -364,7 +373,7 @@ async def _insert_chunks_into_shuffle( partition_assignment=PartitionAssignment.CONTIGUOUS, ) async with shuffle.inserting() as inserter: - for msg in chunk_store: + while (msg := await ch_in.recv(context)) is not None: if skip_insert: continue seq_num = msg.sequence_number @@ -379,6 +388,8 @@ async def _insert_chunks_into_shuffle( upstreams=(available_chunk.stream, sort_boundaries_df.stream), ) + # TODO: Pre-sort chunks if they do not originate from the ChunkStore. + # (Not possible until we use _global_sort outside of sort_actor.) splits = find_sort_splits( sort_cols_tbl, sort_boundaries_df.table, @@ -453,6 +464,52 @@ async def _extract_partitions_and_send( await ch_out.drain(context) +async def _global_sort( + context: Context, + comm: Communicator, + ir: Sort, + ir_context: IRExecutionContext, + ch_out: Channel[TableChunk], + ch_in: Channel[TableChunk], + metadata_in: ChannelMetadata, + by: list[str], + num_partitions: int, + sort_boundaries_df: DataFrame, + collective_ids: list[int], + *, + tracer: ActorTracer | None, +) -> None: + """Global sort.""" + # TODO: Attach OrderScheme metadata here. 
+ output_metadata = ChannelMetadata( + local_count=max(1, num_partitions // comm.nranks), + partitioning=Partitioning(inter_rank=None, local="inherit"), + ) + await send_metadata(ch_out, context, output_metadata) + + shuffle, post_sort_ir = await _insert_chunks_into_shuffle( + context, + comm, + ir, + ir_context, + ch_in, + num_partitions, + collective_ids, + metadata_in, + sort_boundaries_df, + by, + ) + await _extract_partitions_and_send( + context, + ch_out, + shuffle, + post_sort_ir, + ir_context, + ir.schema, + tracer=tracer, + ) + + @define_actor() async def sort_actor( context: Context, @@ -467,10 +524,18 @@ async def sort_actor( collective_ids: list[int], ) -> None: """Streaming sort actor.""" - ch_replay = context.create_channel() + ch_sample_replay = context.create_channel() + ch_chunk_store = context.create_channel() async with shutdown_on_error( - context, ch_in, ch_out, ch_replay, trace_ir=ir, ir_context=ir_context + context, + ch_in, + ch_out, + ch_sample_replay, + ch_chunk_store, + trace_ir=ir, + ir_context=ir_context, ) as tracer: + # TODO: Skip sort if OrderScheme metadata is present and compatible. 
metadata_in = await recv_metadata(ch_in, context) if ir.zlice is not None: @@ -494,20 +559,19 @@ async def sort_actor( context, comm, ch_in, num_partitions, metadata_in, executor, collective_ids ) - output_metadata = ChannelMetadata( - local_count=max(1, num_partitions // comm.nranks), - partitioning=Partitioning(inter_rank=None, local="inherit"), - ) - await send_metadata(ch_out, context, output_metadata) - chunk_store = ChunkStore(context) _, local_candidates_list = await gather_in_task_group( replay_buffered_channel( - context, ch_replay, ch_in, sampled_chunks, metadata_in, trace_ir=ir + context, + ch_sample_replay, + ch_in, + sampled_chunks, + metadata_in, + trace_ir=ir, ), _receive_and_buffer_chunks( context, - ch_replay, + ch_sample_replay, chunk_store, ir, by, @@ -529,27 +593,22 @@ async def sort_actor( collective_ids.pop() if need_allgather else None, ) - shuffle, post_sort_ir = await _insert_chunks_into_shuffle( - context, - comm, - num_partitions, - collective_ids, - metadata_in, - chunk_store, - sort_boundaries_df, - ir, - ir_context, - by, - ) - - await _extract_partitions_and_send( - context, - ch_out, - shuffle, - post_sort_ir, - ir_context, - ir.schema, - tracer=tracer, + await gather_in_task_group( + _forward_from_chunk_store(context, ch_chunk_store, chunk_store), + _global_sort( + context, + comm, + ir, + ir_context, + ch_out, + ch_chunk_store, + metadata_in, + by, + num_partitions, + sort_boundaries_df, + collective_ids, + tracer=tracer, + ), ) From 996eb35f8a6775d1577c979f2c4b1385b958ad5b Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 7 May 2026 21:07:24 +0200 Subject: [PATCH 35/36] Run the cudf-polars test suite against `DaskEngine` and `RayEngine` (#22381) Builds on the cached `streaming_engines` fixture from #22364, which amortizes SPMD bootstrap via `_reset()`, and extends the same pattern to Dask and Ray. 
With this change, the test matrix runs against: `["in-memory", "spmd", "spmd-small", "dask", "ray"]` subject to package availability and `rrun` gating. We might change the different setups later, but for now CI runs: | Engine | Block Size(s) | GPU Configuration | |----------------|-----------------------|-------------------| | `SPMDEngine` | `"medium"`, `"small"` | Single GPU | | `DaskEngine` | `"medium"` | Single GPU | | `RayEngine` | `"medium"` | Two GPUs | Authors: - Mads R. B. Kristensen (https://github.com/madsbk) - Peter Andreas Entschev (https://github.com/pentschev) Approvers: - Matthew Murray (https://github.com/Matt711) - Bradley Dice (https://github.com/bdice) - Peter Andreas Entschev (https://github.com/pentschev) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/22381 --- .github/workflows/pr.yaml | 89 ++++++++++--------- .github/workflows/test.yaml | 1 + ci/run_cudf_polars_experimental_pytests.sh | 2 +- ci/test_cudf_polars_experimental.sh | 2 +- dependencies.yaml | 13 +++ .../cudf_polars/experimental/join.py | 20 +++-- .../cudf_polars/testing/engine_utils.py | 43 +++++++++ python/cudf_polars/pyproject.toml | 3 + python/cudf_polars/tests/conftest.py | 72 +++++++++++++-- .../experimental/test_all_gather_host_data.py | 2 - .../tests/experimental/test_dataframescan.py | 21 ++--- .../tests/experimental/test_filter.py | 9 +- .../tests/experimental/test_groupby.py | 8 +- .../tests/experimental/test_io_multirank.py | 48 ++-------- .../tests/experimental/test_join.py | 52 ++++++----- .../tests/experimental/test_metadata.py | 22 +++-- .../tests/experimental/test_parallel.py | 4 +- .../tests/experimental/test_rolling.py | 13 ++- .../tests/experimental/test_select.py | 25 ++++-- .../tests/experimental/test_spilling.py | 8 +- .../tests/experimental/test_statistics.py | 53 ++--------- .../tests/experimental/test_unique.py | 13 +-- 22 files changed, 310 insertions(+), 213 deletions(-) diff --git a/.github/workflows/pr.yaml 
b/.github/workflows/pr.yaml index a7ef36049b0..f373953d5f1 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -470,50 +470,51 @@ jobs: with: build_type: pull-request script: ci/test_wheel_cudf.sh - # wheel-build-cudf-polars: - # needs: wheel-build-pylibcudf - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main - # with: - # # This selects "ARCH=amd64 + the latest supported Python + CUDA". - # matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - # build_type: pull-request - # node_type: cpu8 - # script: "ci/build_wheel_cudf_polars.sh" - # package-name: cudf_polars - # package-type: python - # pure-wheel: true - # wheel-tests-cudf-polars: - # needs: [wheel-build-cudf-polars, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels && fromJSON(needs.changed-files.outputs.changed_file_groups).neither_cudf_nor_dask_cudf - # with: - # # This selects "ARCH=amd64 + the latest supported Python + CUDA". - # matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - # build_type: pull-request - # script: "ci/test_wheel_cudf_polars.sh" - # wheel-tests-cudf-polars-with-rapidsmpf: - # needs: [wheel-build-cudf-polars, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels && fromJSON(needs.changed-files.outputs.changed_file_groups).neither_cudf_nor_dask_cudf - # with: - # # This selects "ARCH=amd64 + the latest supported Python + CUDA" to minimize CI usage. 
- # # (rapidsmpf compatibility already validated in rapidsmpf CI) - # matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - # build_type: pull-request - # script: "ci/test_cudf_polars_experimental.sh" - # cudf-polars-polars-tests: - # needs: [wheel-build-cudf-polars, changed-files] - # secrets: inherit - # uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main - # if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels && fromJSON(needs.changed-files.outputs.changed_file_groups).neither_cudf_nor_dask_cudf - # with: - # # This selects "ARCH=amd64 + the latest supported Python + CUDA". - # matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) - # build_type: pull-request - # script: "ci/test_cudf_polars_polars_tests.sh" + wheel-build-cudf-polars: + needs: wheel-build-pylibcudf + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@main + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
+ matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + node_type: cpu8 + script: "ci/build_wheel_cudf_polars.sh" + package-name: cudf_polars + package-type: python + pure-wheel: true + wheel-tests-cudf-polars: + needs: [wheel-build-cudf-polars, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels && fromJSON(needs.changed-files.outputs.changed_file_groups).neither_cudf_nor_dask_cudf + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/test_wheel_cudf_polars.sh" + wheel-tests-cudf-polars-with-rapidsmpf: + needs: [wheel-build-cudf-polars, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels && fromJSON(needs.changed-files.outputs.changed_file_groups).neither_cudf_nor_dask_cudf + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA" to minimize CI usage. 
+ # (rapidsmpf compatibility already validated in rapidsmpf CI) + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" + script: "ci/test_cudf_polars_experimental.sh" + cudf-polars-polars-tests: + needs: [wheel-build-cudf-polars, changed-files] + secrets: inherit + uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@main + if: fromJSON(needs.changed-files.outputs.changed_file_groups).test_python_wheels && fromJSON(needs.changed-files.outputs.changed_file_groups).neither_cudf_nor_dask_cudf + with: + # This selects "ARCH=amd64 + the latest supported Python + CUDA". + matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) + build_type: pull-request + script: "ci/test_cudf_polars_polars_tests.sh" wheel-build-dask-cudf: needs: wheel-build-cudf secrets: inherit diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index b4977f60def..a6b0b6f3326 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -174,6 +174,7 @@ jobs: matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) build_type: ${{ inputs.build_type }} branch: ${{ inputs.branch }} + container-options: "--cap-add CAP_SYS_PTRACE --shm-size=8g --ulimit=nofile=1000000:1000000" date: ${{ inputs.date }} sha: ${{ inputs.sha }} script: "ci/test_cudf_polars_experimental.sh" diff --git a/ci/run_cudf_polars_experimental_pytests.sh b/ci/run_cudf_polars_experimental_pytests.sh index d0a4767bd99..da659c7b386 100755 --- a/ci/run_cudf_polars_experimental_pytests.sh 
+++ b/ci/run_cudf_polars_experimental_pytests.sh @@ -10,5 +10,5 @@ set -euo pipefail # Support invoking outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/ -echo "Running the full cudf-polars test suite with both the in-memory and spmd engine" +echo "Running the full cudf-polars test suite" python -m pytest --cache-clear "$@" tests diff --git a/ci/test_cudf_polars_experimental.sh b/ci/test_cudf_polars_experimental.sh index aa3abd66254..4b796ff4b94 100755 --- a/ci/test_cudf_polars_experimental.sh +++ b/ci/test_cudf_polars_experimental.sh @@ -28,7 +28,7 @@ rapids-pip-retry install \ -v \ --prefer-binary \ --constraint "${PIP_CONSTRAINT}" \ - "$(echo "${CUDF_POLARS_WHEELHOUSE}"/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,experimental]" \ + "$(echo "${CUDF_POLARS_WHEELHOUSE}"/cudf_polars_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test,experimental,ray]" \ "$(echo "${LIBCUDF_WHEELHOUSE}"/libcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${PYLIBCUDF_WHEELHOUSE}"/pylibcudf_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" diff --git a/dependencies.yaml b/dependencies.yaml index af7dfea460f..9701fb3733b 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -384,6 +384,14 @@ files: key: experimental includes: - run_cudf_polars_experimental + py_run_cudf_polars_ray: + output: pyproject + pyproject_dir: python/cudf_polars + extras: + table: project.optional-dependencies + key: ray + includes: + - depends_on_ray py_test_cudf_polars: output: pyproject pyproject_dir: python/cudf_polars @@ -1290,6 +1298,11 @@ dependencies: - matrix: packages: - *rapidsmpf_unsuffixed + depends_on_ray: + common: + - output_types: [conda, requirements, pyproject] + packages: + - ray>=2.55.1 depends_on_rapids_logger: common: - output_types: [conda, requirements, pyproject] diff --git a/python/cudf_polars/cudf_polars/experimental/join.py b/python/cudf_polars/cudf_polars/experimental/join.py index cd5c514b45a..1682762c9e8 100644 --- 
a/python/cudf_polars/cudf_polars/experimental/join.py +++ b/python/cudf_polars/cudf_polars/experimental/join.py @@ -164,20 +164,22 @@ def _( left, pi_left = rec(left) right, pi_right = rec(right) - # Fallback to single partition on the smaller table + # Fallback to single partition on the smaller table whenever either + # side has more than one partition. left_count = pi_left[left].count right_count = pi_right[right].count output_count = max(left_count, right_count) - fallback_msg = "ConditionalJoin not supported for multiple partitions." - if left_count < right_count: - if left_count > 1 or dynamic_planning: + if output_count > 1 or dynamic_planning: + if left_count < right_count: left = Repartition(left.schema, left) pi_left[left] = PartitionInfo(count=1) - _fallback_inform(fallback_msg, config_options) - elif right_count > 1 or dynamic_planning: - right = Repartition(right.schema, right) - pi_right[right] = PartitionInfo(count=1) - _fallback_inform(fallback_msg, config_options) + else: + right = Repartition(right.schema, right) + pi_right[right] = PartitionInfo(count=1) + _fallback_inform( + "ConditionalJoin not supported for multiple partitions.", + config_options, + ) # Reconstruct and return new_node = ir.reconstruct([left, right]) diff --git a/python/cudf_polars/cudf_polars/testing/engine_utils.py b/python/cudf_polars/cudf_polars/testing/engine_utils.py index c36bcf2ed27..b0b640615f7 100644 --- a/python/cudf_polars/cudf_polars/testing/engine_utils.py +++ b/python/cudf_polars/cudf_polars/testing/engine_utils.py @@ -11,6 +11,7 @@ if TYPE_CHECKING: from collections.abc import Mapping + from contextlib import AbstractContextManager import polars as pl @@ -21,6 +22,15 @@ STREAMING_ENGINE_FIXTURE_PARAMS: list[str] = [] if importlib.util.find_spec("rapidsmpf") is not None: STREAMING_ENGINE_FIXTURE_PARAMS.extend(["spmd", "spmd-small"]) + # ``DaskEngine`` and ``RayEngine`` both reject construction inside an + # ``rrun`` cluster. 
+ from rapidsmpf.bootstrap import is_running_with_rrun as _is_running_with_rrun + + if not _is_running_with_rrun(): # pragma: no cover + if importlib.util.find_spec("distributed") is not None: + STREAMING_ENGINE_FIXTURE_PARAMS.append("dask") + if importlib.util.find_spec("ray") is not None: + STREAMING_ENGINE_FIXTURE_PARAMS.append("ray") ALL_ENGINE_FIXTURE_PARAMS = ["in-memory", *STREAMING_ENGINE_FIXTURE_PARAMS] @@ -63,6 +73,34 @@ def is_streaming_engine(obj: Any) -> bool: return isinstance(obj, StreamingEngine) +def warns_on_spmd( # pragma: no cover; rapidsmpf-only path + engine: Any, + *args: Any, + when: bool = True, + **kwargs: Any, +) -> AbstractContextManager[Any]: + """ + ``pytest.warns(*args, **kwargs)`` on SPMD; ``nullcontext`` otherwise. + + ``pytest.warns`` only captures warnings emitted in the test process. On + multi-process backends (``DaskEngine``, ``RayEngine``) the fallback + warning fires on workers/actors and only appears in worker logs/stdout, + so the assertion is replaced with a passthrough on those backends. + + The optional ``when`` kwarg lets callers compose an additional gate (e.g. + a parametrize value) without an outer ``if``. + """ + import contextlib + + import pytest + + from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine + + if when and isinstance(engine, SPMDEngine): + return pytest.warns(*args, **kwargs) + return contextlib.nullcontext() + + def create_streaming_options( blocksize_mode: Literal["medium", "small"], overrides: StreamingOptions | None = None, @@ -87,6 +125,9 @@ def create_streaming_options( from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions from cudf_polars.utils.config import StreamingFallbackMode + # ``allow_gpu_sharing=True`` is always set so the cached multi-rank + # engines (Dask workers, Ray actors with ``num_ranks > 1``) don't trip + # the UUID-collision guard on every ``_reset(...)``. 
match blocksize_mode: case "medium": baseline = StreamingOptions( @@ -94,6 +135,7 @@ def create_streaming_options( dynamic_planning={}, target_partition_size=1_000_000, raise_on_fail=True, + allow_gpu_sharing=True, ) case "small": baseline = StreamingOptions( @@ -102,6 +144,7 @@ def create_streaming_options( target_partition_size=10, raise_on_fail=True, fallback_mode=StreamingFallbackMode.SILENT, + allow_gpu_sharing=True, ) case _: # pragma: no cover raise ValueError(f"Unknown blocksize_mode: {blocksize_mode!r}") diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index 47633e42364..7703cad7dad 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -63,6 +63,9 @@ rapidsmpf = [ "pyarrow>=19.0.0,<24", "rapidsmpf==26.6.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +ray = [ + "ray>=2.55.1", +] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. [project.urls] Homepage = "https://github.com/rapidsai/cudf" diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py index b3d83b36d36..65445b683ae 100644 --- a/python/cudf_polars/tests/conftest.py +++ b/python/cudf_polars/tests/conftest.py @@ -31,6 +31,12 @@ StreamingEngines: TypeAlias = Mapping[str, StreamingEngine] +# Number of ranks for multi-rank streaming engines that share one GPU +# (currently ``RayEngine``). Single-GPU dev hosts and CI runners require +# ``allow_gpu_sharing=True`` to oversubscribe one device across actors. 
+NUM_RANKS = 2 + + @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session") def with_nulls(request): return request.param @@ -89,6 +95,27 @@ def streaming_engines() -> Generator[StreamingEngines, None, None]: ) engines: dict[str, StreamingEngine] = {"spmd": SPMDEngine(comm=comm)} + + if "dask" in STREAMING_ENGINE_FIXTURE_PARAMS: # pragma: no cover + from cudf_polars.experimental.rapidsmpf.frontend.dask import DaskEngine + + engines["dask"] = DaskEngine(engine_options={"allow_gpu_sharing": True}) + + if "ray" in STREAMING_ENGINE_FIXTURE_PARAMS: # pragma: no cover + from cudf_polars.experimental.rapidsmpf.frontend.ray import RayEngine + + # Always pin ``num_ranks`` so the cached engine has a deterministic + # actor count regardless of how many GPUs the host happens to have; + # otherwise ``RayEngine`` defaults to ``get_num_gpus_in_ray_cluster()`` + # and tests that depend on rank-count behavior (e.g. fast-count + # parquet, concat) become non-portable. Pinning ``num_ranks`` requires + # ``allow_gpu_sharing=True`` (production guard). + engines["ray"] = RayEngine( + num_ranks=NUM_RANKS, + engine_options={"allow_gpu_sharing": True}, + ray_init_options={"include_dashboard": False}, + ) + try: yield engines finally: @@ -108,6 +135,28 @@ def spmd_engine(streaming_engines: StreamingEngines) -> SPMDEngine: return engine +@pytest.fixture +def spmd_engine_factory( + streaming_engines: StreamingEngines, +) -> Callable[..., SPMDEngine]: + """ + Return a factory that yields the shared :class:`SPMDEngine`. + + Use this in place of :func:`streaming_engine_factory` for tests that + must run on SPMD only. 
+ """ + from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine + + param = EngineFixtureParam(full_name="spmd") + + def factory(options: StreamingOptions | None = None) -> SPMDEngine: + engine = build_streaming_engine(param, streaming_engines, options) + assert isinstance(engine, SPMDEngine) + return engine + + return factory + + @pytest.fixture(params=STREAMING_ENGINE_FIXTURE_PARAMS) def _streaming_engine_param(request: pytest.FixtureRequest) -> EngineFixtureParam: """Parametrization helper to run tests for each streaming engine variant.""" @@ -246,10 +295,9 @@ def pytest_configure(config): config.addinivalue_line( "markers", - "skip_on_streaming_engine(reason): skip the test for streaming " - '``engine`` variants (e.g. ``"spmd"``, ``"spmd-small"``) while ' - "still letting the in-memory variant run. Use this to track features " - "that have no multi-partition implementation", + "skip_on_streaming_engine(reason, *, engine=None): skip the test for " + 'streaming ``engine`` variants (e.g. ``"spmd"``, ``"spmd-small"``, ' + '``"dask"``, ``"ray"``) while still allowing the in-memory variant to run.', ) # Ray's internal subprocess management leaks `/dev/null` file handles, and @@ -275,9 +323,23 @@ def pytest_collection_modifyitems(items): callspec = getattr(item, "callspec", None) if callspec is None: continue - engine_param = callspec.params.get("_all_engine_param") + # Tests bind to either ``engine`` (parametrized via ``_all_engine_param``) + # or ``streaming_engine`` / ``streaming_engine_factory`` (parametrized via + # ``_streaming_engine_param``). Check both. 
+ engine_param = callspec.params.get("_all_engine_param") or callspec.params.get( + "_streaming_engine_param" + ) if engine_param is None or engine_param == "in-memory": continue + engine_filter = marker.kwargs.get("engine") + if engine_filter is not None: + if isinstance(engine_filter, str): + engine_filter = (engine_filter,) + # Strip the ``-small`` suffix so ``"spmd-small"`` matches + # ``engine=("spmd",)``. + engine_name = engine_param.removesuffix("-small") + if engine_name not in engine_filter: + continue reason = ( marker.args[0] if marker.args diff --git a/python/cudf_polars/tests/experimental/test_all_gather_host_data.py b/python/cudf_polars/tests/experimental/test_all_gather_host_data.py index 8f09a82c4bd..c85598a8c64 100644 --- a/python/cudf_polars/tests/experimental/test_all_gather_host_data.py +++ b/python/cudf_polars/tests/experimental/test_all_gather_host_data.py @@ -59,8 +59,6 @@ def test_gather_cluster_info(streaming_engine) -> None: assert isinstance(info.gpu_uuid, str) # Each rank runs in its own process. assert len({info.pid for info in infos}) == streaming_engine.nranks - # Without allow_gpu_sharing, all UUIDs must be unique (enforced at init). - assert len({info.gpu_uuid for info in infos}) == streaming_engine.nranks def test_cluster_info_cuda_visible_devices(monkeypatch) -> None: diff --git a/python/cudf_polars/tests/experimental/test_dataframescan.py b/python/cudf_polars/tests/experimental/test_dataframescan.py index dbf22848824..fb263e20b94 100644 --- a/python/cudf_polars/tests/experimental/test_dataframescan.py +++ b/python/cudf_polars/tests/experimental/test_dataframescan.py @@ -60,19 +60,20 @@ def test_parallel_dataframescan(df, streaming_engine_factory, max_rows_per_parti assert count == 1 -@pytest.mark.xfail( - reason=( - "Multi-rank Union interleaves child outputs across ranks: client " - "receives [rank0_A, rank0_B, rank1_A, rank1_B] instead of the " - "polars-CPU [A, B]. 
Tracked in " - "https://github.com/rapidsai/cudf/issues/22376." - ), - strict=False, -) -def test_dataframescan_concat(df, streaming_engine_factory): +def test_dataframescan_concat(request, df, streaming_engine_factory): streaming_engine = streaming_engine_factory( StreamingOptions(max_rows_per_partition=1_000), ) + if streaming_engine.nranks > 1: + # Multi-rank Union interleaves child outputs across ranks: client + # receives [rank0_A, rank0_B, rank1_A, rank1_B] instead of the + # polars-CPU [A, B]. + request.applymarker( + pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/22376", + strict=False, + ) + ) df2 = pl.concat([df, df]) assert_gpu_result_equal(df2, engine=streaming_engine) diff --git a/python/cudf_polars/tests/experimental/test_filter.py b/python/cudf_polars/tests/experimental/test_filter.py index 4fb11df691c..b8b4fb2749c 100644 --- a/python/cudf_polars/tests/experimental/test_filter.py +++ b/python/cudf_polars/tests/experimental/test_filter.py @@ -9,12 +9,11 @@ from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.engine_utils import warns_on_spmd @pytest.fixture def engine(streaming_engine_factory): - # ``fallback_mode="warn"`` overrides the small-blocksize baseline (which - # sets SILENT) so ``test_filter_non_pointwise`` can assert on the warning. return streaming_engine_factory( StreamingOptions(max_rows_per_partition=3, fallback_mode="warn"), ) @@ -38,7 +37,9 @@ def test_filter_pointwise(df, engine): def test_filter_non_pointwise(df, engine): query = df.filter(pl.col("a") > pl.col("a").max()) - with pytest.warns( - UserWarning, match="This filter is not supported for multiple partitions." 
+ with warns_on_spmd( + engine, + UserWarning, + match="This filter is not supported for multiple partitions.", ): assert_gpu_result_equal(query, engine=engine) diff --git a/python/cudf_polars/tests/experimental/test_groupby.py b/python/cudf_polars/tests/experimental/test_groupby.py index 03d87fe23e9..6ca11387da0 100644 --- a/python/cudf_polars/tests/experimental/test_groupby.py +++ b/python/cudf_polars/tests/experimental/test_groupby.py @@ -131,8 +131,8 @@ def test_groupby_std_var_ddof(df, engine, agg, ddof): @pytest.mark.parametrize("fallback_mode", ["silent", "raise", "warn", "foo"]) -def test_groupby_fallback(df, fallback_mode, streaming_engine_factory): - streaming_engine = streaming_engine_factory( +def test_groupby_fallback(df, fallback_mode, spmd_engine_factory): + streaming_engine = spmd_engine_factory( StreamingOptions(fallback_mode=fallback_mode), ) match = "Failed to decompose groupby aggs" @@ -287,6 +287,10 @@ def test_groupby_count_type_mismatch(df, streaming_engine_factory): assert_gpu_result_equal(q, engine=streaming_engine, check_row_order=False) +@pytest.mark.skip_on_streaming_engine( + "patch.object on ShuffleManager.Inserter doesn't reach worker processes", + engine=("dask", "ray"), +) def test_shuffle_reduce_insert_finished_called_on_oom(streaming_engine_factory): streaming_engine = streaming_engine_factory( StreamingOptions(target_partition_size=10, max_rows_per_partition=5), diff --git a/python/cudf_polars/tests/experimental/test_io_multirank.py b/python/cudf_polars/tests/experimental/test_io_multirank.py index 2208cc67316..bf9e8e70343 100644 --- a/python/cudf_polars/tests/experimental/test_io_multirank.py +++ b/python/cudf_polars/tests/experimental/test_io_multirank.py @@ -7,16 +7,15 @@ from typing import TYPE_CHECKING import pytest -from rapidsmpf.bootstrap import is_running_with_rrun import polars as pl -from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine +from cudf_polars.experimental.rapidsmpf.frontend.options import 
StreamingOptions from cudf_polars.testing.asserts import assert_sink_result_equal from cudf_polars.utils.config import Cluster, StreamingExecutor if TYPE_CHECKING: - from collections.abc import Iterator + from collections.abc import Callable from pathlib import Path from cudf_polars.experimental.rapidsmpf.frontend.core import StreamingEngine @@ -39,43 +38,14 @@ def df() -> pl.LazyFrame: ) -@pytest.fixture(params=["spmd", "ray", "dask"]) +@pytest.fixture def engine( - request: pytest.FixtureRequest, - spmd_engine: SPMDEngine, -) -> Iterator[StreamingEngine]: - """Yield each supported streaming engine.""" - backend = request.param - executor_options = {"max_rows_per_partition": 1_000} - - if backend == "spmd": - with SPMDEngine( - comm=spmd_engine.comm, - executor_options=executor_options, - ) as eng: - yield eng - return - - if is_running_with_rrun(): - pytest.skip(f"{backend}Engine must not be created from within an rrun cluster") - - if backend == "ray": - pytest.importorskip("ray", reason="ray is not installed") - from cudf_polars.experimental.rapidsmpf.frontend.ray import RayEngine - - with RayEngine( - executor_options=executor_options, - ray_init_options={"include_dashboard": False}, - ) as eng: - yield eng - return - - assert backend == "dask" - pytest.importorskip("distributed", reason="distributed is not installed") - from cudf_polars.experimental.rapidsmpf.frontend.dask import DaskEngine - - with DaskEngine(executor_options=executor_options) as eng: - yield eng + streaming_engine_factory: Callable[..., StreamingEngine], +) -> StreamingEngine: + """Yield each supported streaming engine pinned to small partitions.""" + return streaming_engine_factory( + StreamingOptions(max_rows_per_partition=1_000), + ) def test_sink_parquet_directory( diff --git a/python/cudf_polars/tests/experimental/test_join.py b/python/cudf_polars/tests/experimental/test_join.py index 6a09ff95ef5..1b4635dd924 100644 --- a/python/cudf_polars/tests/experimental/test_join.py +++ 
b/python/cudf_polars/tests/experimental/test_join.py @@ -19,6 +19,7 @@ from cudf_polars.experimental.shuffle import Shuffle from cudf_polars.experimental.statistics import collect_statistics from cudf_polars.testing.asserts import assert_gpu_result_equal +from cudf_polars.testing.engine_utils import warns_on_spmd from cudf_polars.utils.config import ConfigOptions, StreamingExecutor @@ -103,12 +104,11 @@ def test_join_conditional(reverse, max_rows_per_partition, streaming_engine_fact if reverse: left, right = right, left q = left.join_where(right, pl.col("y") < pl.col("yy")) - if max_rows_per_partition == 3: - with pytest.warns( - UserWarning, match="ConditionalJoin not supported for multiple partitions." - ): - assert_gpu_result_equal(q, engine=streaming_engine, check_row_order=False) - else: + with warns_on_spmd( + streaming_engine, + UserWarning, + match="ConditionalJoin not supported for multiple partitions.", + ): assert_gpu_result_equal(q, engine=streaming_engine, check_row_order=False) @@ -156,7 +156,7 @@ def test_join(left, right, how, reverse, streaming_engine_factory, options): @pytest.mark.parametrize("zlice", [(0, 2), (2, 2), (-2, None)]) -def test_join_and_slice(zlice, streaming_engine_factory): +def test_join_and_slice(request, zlice, streaming_engine_factory): streaming_engine = streaming_engine_factory( StreamingOptions( max_rows_per_partition=3, @@ -164,6 +164,16 @@ def test_join_and_slice(zlice, streaming_engine_factory): fallback_mode="warn", ), ) + if streaming_engine.nranks > 1: + # The multi-rank fallback for slice doesn't preserve row order + # within equal-key groups, so the slice can pick different rows + # than the CPU baseline. 
+ request.applymarker( + pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/22405", + strict=False, + ) + ) left = pl.LazyFrame( { "a": [1, 2, 3, 1, None], @@ -181,23 +191,22 @@ def test_join_and_slice(zlice, streaming_engine_factory): q = left.join(right, on="a", how="inner").slice(*zlice) # Check that we get the correct row count # See: https://github.com/rapidsai/cudf/issues/19153 - if zlice in {(2, 2), (-2, None)}: - with pytest.warns( - UserWarning, match="This slice not supported for multiple partitions." - ): - assert q.collect(engine=streaming_engine).height == q.collect().height - else: + with warns_on_spmd( + streaming_engine, + UserWarning, + match="This slice not supported for multiple partitions.", + when=zlice in {(2, 2), (-2, None)}, + ): assert q.collect(engine=streaming_engine).height == q.collect().height # Need sort to match order after a join q = left.join(right, on="a", how="inner").sort(pl.col("a")).slice(*zlice) - if zlice == (2, 2): - with pytest.warns( - UserWarning, - match="This slice not supported for multiple partitions.", - ): - assert_gpu_result_equal(q, engine=streaming_engine) - else: + with warns_on_spmd( + streaming_engine, + UserWarning, + match="This slice not supported for multiple partitions.", + when=zlice == (2, 2), + ): assert_gpu_result_equal(q, engine=streaming_engine) @@ -232,7 +241,8 @@ def test_join_maintain_order_fallback_streaming( ) q = left.join(right, on="y", how="inner", maintain_order=maintain_order) - with pytest.warns( + with warns_on_spmd( + streaming_engine, UserWarning, match=r"Join\(maintain_order=.*\) not supported for multiple partitions\.", ): diff --git a/python/cudf_polars/tests/experimental/test_metadata.py b/python/cudf_polars/tests/experimental/test_metadata.py index 618087a27c5..791e33744cd 100644 --- a/python/cudf_polars/tests/experimental/test_metadata.py +++ b/python/cudf_polars/tests/experimental/test_metadata.py @@ -66,20 +66,30 @@ def right() -> pl.LazyFrame: def 
test_rapidsmpf_join_metadata( left: pl.LazyFrame, right: pl.LazyFrame, - streaming_engine_factory, + spmd_engine_factory, options, ) -> None: - streaming_engine = streaming_engine_factory(options) - config_options = ConfigOptions.from_polars_engine(streaming_engine) + # Pinned to SPMD: ``ChannelMetadata.__reduce_cython__`` can't pickle + # ``self._handle`` across worker/actor processes, so the + # ``metadata_collector`` round-trip fails on Dask and Ray. + # + # When https://github.com/rapidsai/cudf/pull/22394 lands, dedup of + # replicated outputs moves to the Dask/Ray frontends and the + # ``duplicated`` flag's semantics change to "every rank holds the + # data". Revisit the ``len(metadata_collector) == 1`` and + # ``metadata.duplicated is False`` assertions below, and reconsider + # whether this test can widen to ``streaming_engine_factory``. + engine = spmd_engine_factory(options) + config_options = ConfigOptions.from_polars_engine(engine) broadcast_join_limit = config_options.executor.broadcast_join_limit q = left.join( right, on="y", how="left", ).filter(pl.col("x") > pl.col("zz")) - ir = Translator(q._ldf.visit(), streaming_engine).translate_ir() - left_count = left.collect(engine=streaming_engine).height - right_count = right.collect(engine=streaming_engine).height + ir = Translator(q._ldf.visit(), engine).translate_ir() + left_count = left.collect(engine=engine).height + right_count = right.collect(engine=engine).height metadata_collector = evaluate_logical_plan( ir, config_options, collect_metadata=True diff --git a/python/cudf_polars/tests/experimental/test_parallel.py b/python/cudf_polars/tests/experimental/test_parallel.py index 67fc372e2e4..a9a0ff63786 100644 --- a/python/cudf_polars/tests/experimental/test_parallel.py +++ b/python/cudf_polars/tests/experimental/test_parallel.py @@ -50,10 +50,10 @@ def test_rename_concat(streaming_engine) -> None: assert_gpu_result_equal(q, engine=streaming_engine) -def 
test_fallback_on_concat_zlice(streaming_engine_factory) -> None: +def test_fallback_on_concat_zlice(spmd_engine_factory) -> None: # Pin ``fallback_mode="warn"`` so the spmd-small baseline (which sets # ``SILENT``) doesn't suppress the warning this test asserts on. - streaming_engine = streaming_engine_factory(StreamingOptions(fallback_mode="warn")) + streaming_engine = spmd_engine_factory(StreamingOptions(fallback_mode="warn")) q = pl.concat( [ pl.LazyFrame({"a": [1, 2]}), diff --git a/python/cudf_polars/tests/experimental/test_rolling.py b/python/cudf_polars/tests/experimental/test_rolling.py index 37de6f7f8a1..ee3ae137e27 100644 --- a/python/cudf_polars/tests/experimental/test_rolling.py +++ b/python/cudf_polars/tests/experimental/test_rolling.py @@ -8,6 +8,7 @@ import polars as pl from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions +from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine from cudf_polars.testing.asserts import assert_gpu_result_equal from cudf_polars.utils.versions import POLARS_VERSION_LT_136 @@ -46,10 +47,20 @@ def test_rolling_datetime(request, engine): assert_gpu_result_equal(q, engine=engine) -def test_over_in_filter_unsupported(streaming_engine_factory) -> None: +def test_over_in_filter_unsupported(request, streaming_engine_factory) -> None: engine = streaming_engine_factory( StreamingOptions(max_rows_per_partition=1, fallback_mode="warn"), ) + if not isinstance(engine, SPMDEngine): + # On Dask/Ray the fallback warning fires on worker processes and is + # invisible to ``pytest.warns``; the multi-rank fallback also + # doesn't preserve row order. 
+ request.applymarker( + pytest.mark.xfail( + reason="https://github.com/rapidsai/cudf/issues/22405", + strict=False, + ) + ) q = pl.concat( [ pl.LazyFrame({"k": ["x", "y"], "v": [3, 2]}), diff --git a/python/cudf_polars/tests/experimental/test_select.py b/python/cudf_polars/tests/experimental/test_select.py index 264f8b5aab1..cef9f0f66cf 100644 --- a/python/cudf_polars/tests/experimental/test_select.py +++ b/python/cudf_polars/tests/experimental/test_select.py @@ -22,6 +22,7 @@ assert_gpu_result_equal, assert_ir_translation_raises, ) +from cudf_polars.testing.engine_utils import warns_on_spmd from cudf_polars.utils.versions import ( POLARS_VERSION_LT_132, POLARS_VERSION_LT_134, @@ -54,8 +55,8 @@ def test_select(df, engine): @pytest.mark.parametrize("fallback_mode", ["silent", "raise", "warn", "foo"]) -def test_select_reduce_fallback(df, streaming_engine_factory, fallback_mode): - engine = streaming_engine_factory( +def test_select_reduce_fallback(df, spmd_engine_factory, fallback_mode): + engine = spmd_engine_factory( StreamingOptions(max_rows_per_partition=3, fallback_mode=fallback_mode), ) match = "This selection is not supported for multiple partitions." 
@@ -84,13 +85,17 @@ def test_select_reduce_fallback(df, streaming_engine_factory, fallback_mode): assert_gpu_result_equal(query, engine=engine) -def test_select_fill_null_with_strategy(df, engine): +def test_select_fill_null_with_strategy(df, streaming_engine_factory): + engine = streaming_engine_factory( + StreamingOptions(max_rows_per_partition=3, fallback_mode="warn"), + ) q = df.select(pl.col("a").forward_fill()) if POLARS_VERSION_LT_132: assert_ir_translation_raises(q, NotImplementedError) else: - with pytest.warns( + with warns_on_spmd( + engine, UserWarning, match="fill_null with strategy other than 'zero' or 'one' is not supported for multiple partitions", ): @@ -183,15 +188,19 @@ def test_select_mean_with_decimals(engine): assert_gpu_result_equal(q, engine=engine, check_dtypes=not POLARS_VERSION_LT_134) -def test_select_with_len(engine): - # https://github.com/pola-rs/polars/issues/25592 +def test_select_with_len(streaming_engine_factory): + engine = streaming_engine_factory( + StreamingOptions(max_rows_per_partition=3, fallback_mode="warn"), + ) df1 = pl.LazyFrame({"c0": [1] * 4}) df2 = pl.LazyFrame({"c0": [2] * 4}) q = pl.concat([df1.join(df2, how="cross"), df1.with_columns(pl.lit(None))]).select( pl.len() ) - with pytest.warns( - UserWarning, match="Cross join not support for multiple partitions" + with warns_on_spmd( + engine, + UserWarning, + match="Cross join not support for multiple partitions", ): assert_gpu_result_equal(q, engine=engine) diff --git a/python/cudf_polars/tests/experimental/test_spilling.py b/python/cudf_polars/tests/experimental/test_spilling.py index 6aa11801132..7f79b911038 100644 --- a/python/cudf_polars/tests/experimental/test_spilling.py +++ b/python/cudf_polars/tests/experimental/test_spilling.py @@ -50,20 +50,20 @@ def create_test_table(nbytes: int, stream: Stream) -> plc.Table: ], ) def test_make_spill_function( - streaming_engine_factory, + spmd_engine_factory, *, pinned_memory: bool, spilled_host_mem_type: MemoryType, ) -> 
None: """Test that spilling prioritizes longest queues and newest messages.""" - engine = streaming_engine_factory(StreamingOptions(pinned_memory=pinned_memory)) + engine = spmd_engine_factory(StreamingOptions(pinned_memory=pinned_memory)) context = engine.context if spilled_host_mem_type == MemoryType.PINNED_HOST: - assert engine.context.br().pinned_mr is not None + assert context.br().pinned_mr is not None other_host_mem_type = MemoryType.HOST else: - assert engine.context.br().pinned_mr is None + assert context.br().pinned_mr is None other_host_mem_type = MemoryType.PINNED_HOST # Create 3 spillable message containers simulating fanout buffers diff --git a/python/cudf_polars/tests/experimental/test_statistics.py b/python/cudf_polars/tests/experimental/test_statistics.py index 82c121d5830..42014a02106 100644 --- a/python/cudf_polars/tests/experimental/test_statistics.py +++ b/python/cudf_polars/tests/experimental/test_statistics.py @@ -7,14 +7,12 @@ from typing import TYPE_CHECKING import pytest -from rapidsmpf.bootstrap import is_running_with_rrun -from rapidsmpf.config import Options from rapidsmpf.statistics import Statistics -from cudf_polars.experimental.rapidsmpf.frontend.spmd import SPMDEngine +from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions if TYPE_CHECKING: - from collections.abc import Iterator + from collections.abc import Callable from cudf_polars.experimental.rapidsmpf.frontend.core import StreamingEngine @@ -25,49 +23,14 @@ ] -@pytest.fixture(params=["spmd", "ray", "dask"]) +@pytest.fixture def engine( - request: pytest.FixtureRequest, - spmd_engine: SPMDEngine, -) -> Iterator[StreamingEngine]: + streaming_engine_factory: Callable[..., StreamingEngine], +) -> StreamingEngine: """Yield each supported streaming engine with statistics enabled.""" - backend = request.param - rapidsmpf_options = Options({"statistics": "True"}) - executor_options = {"max_rows_per_partition": 10} - - if backend == "spmd": - with SPMDEngine( 
- comm=spmd_engine.comm, - rapidsmpf_options=rapidsmpf_options, - executor_options=executor_options, - ) as engine: - yield engine - return - - if is_running_with_rrun(): - pytest.skip(f"{backend}Engine must not be created from within an rrun cluster") - - if backend == "ray": - pytest.importorskip("ray", reason="ray is not installed") - from cudf_polars.experimental.rapidsmpf.frontend.ray import RayEngine - - with RayEngine( - rapidsmpf_options=rapidsmpf_options, - executor_options=executor_options, - ray_init_options={"include_dashboard": False}, - ) as engine: - yield engine - return - - assert backend == "dask" - pytest.importorskip("distributed", reason="distributed is not installed") - from cudf_polars.experimental.rapidsmpf.frontend.dask import DaskEngine - - with DaskEngine( - rapidsmpf_options=rapidsmpf_options, - executor_options=executor_options, - ) as engine: - yield engine + return streaming_engine_factory( + StreamingOptions(statistics=True, max_rows_per_partition=10), + ) def test_statistics(engine: StreamingEngine) -> None: diff --git a/python/cudf_polars/tests/experimental/test_unique.py b/python/cudf_polars/tests/experimental/test_unique.py index 6bb30624cb6..1a157c3fe21 100644 --- a/python/cudf_polars/tests/experimental/test_unique.py +++ b/python/cudf_polars/tests/experimental/test_unique.py @@ -10,13 +10,7 @@ from cudf_polars.experimental.rapidsmpf.frontend.options import StreamingOptions from cudf_polars.testing.asserts import assert_gpu_result_equal - - -@pytest.fixture -def engine(streaming_engine_factory): - return streaming_engine_factory( - StreamingOptions(fallback_mode="warn"), - ) +from cudf_polars.testing.engine_utils import warns_on_spmd @pytest.fixture(scope="module") @@ -77,11 +71,12 @@ def test_unique_head_tail(keep, zlice, streaming_engine_factory): ) -def test_unique_complex_slice_fallback(df, engine): +def test_unique_complex_slice_fallback(df, streaming_engine_factory): """Test that unique with complex slice (offset >= 1) 
falls back correctly.""" + engine = streaming_engine_factory(StreamingOptions(fallback_mode="warn")) # unique().slice(offset=5, length=10) has zlice[0] >= 1, triggering fallback q = df.unique(subset=("y",), keep="any").slice(5, 10) - with pytest.warns(UserWarning, match="Complex slice not supported"): + with warns_on_spmd(engine, UserWarning, match="Complex slice not supported"): result = q.collect(engine=engine) # Just verify the fallback produces valid output with expected shape assert result.shape == (10, 3) From 7a120b735f0279221347802d9c02023901a62e78 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 8 May 2026 02:55:05 +0000 Subject: [PATCH 36/36] Address reviews --- python/cudf/cudf/core/groupby/groupby.py | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 153b459f0ba..88fe1f3aca9 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -3070,20 +3070,7 @@ def _to_bool_col(col): if min_count and min_count > 0: counts = self.agg("count") - if isinstance(result, Series): - count_series = ( - counts if isinstance(counts, Series) else counts.iloc[:, 0] - ) - result = result.where(count_series >= min_count, None) - else: - for col_name in result._column_names: - if col_name not in counts._column_names: - continue - count_col = counts._data[col_name] - mask = count_col < min_count - result[col_name] = result[col_name].where( - ~Series._from_column(mask), None - ) + result = result.where(counts >= min_count, None) return result