Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
b288bbc
Implement groupby ``all``/``any`` via bool-coercion + min/max
galipremsagar May 4, 2026
8992d39
Apply suggestions from code review
galipremsagar May 6, 2026
8285223
Reject pd.NA string-to-object casts in pandas-compatible mode (#22295)
galipremsagar May 4, 2026
ca6dddc
Remove legacy Dask-based streaming backends (#22358)
madsbk May 4, 2026
8995085
Fix StatsCollector.serialize to use value equality instead of object …
Matt711 May 4, 2026
5255d51
Pass managed pool MR explicitly in NDSH parquet data generation (#22344)
vuule May 4, 2026
9407fd6
Fix compile warnings in libcudf examples (#22335)
davidwendt May 4, 2026
0e82b62
Add skip axis to all join benchmarks (#22241)
PointKernel May 4, 2026
c2f583c
Expose __from_arrow__ on masked extension dtype proxies (#22373)
galipremsagar May 5, 2026
5c4c21a
Fix datetime resolution for empty data inputs (#22363)
galipremsagar May 5, 2026
31732df
Expose additional proxy attributes for IntervalArray, Styler, and _Me…
galipremsagar May 5, 2026
4aa57e5
Multi-rank fixes for cudf-polars streaming (#22361)
madsbk May 5, 2026
aa23377
Fix reading of large CSV files (>64MB) (#22375)
vuule May 5, 2026
4aa13f1
Add decimal128 to groupby_max_cardinality benchmark (#22162)
PointKernel May 5, 2026
c5cb03b
`StreamingEngine._reset()` (#22364)
madsbk May 5, 2026
8bdabe7
Validate PDS-DS Q1 (#22389)
Matt711 May 6, 2026
e304ffd
Improve hstack lowering (#22353)
rjzamora May 6, 2026
9edc7dc
Replace `LD_PRELOAD` hack with compute-sanitizer (#22290)
KyleFromNVIDIA May 6, 2026
3700502
Run all nvbench benchmarks with timeout in smoketest (#20538)
bdice May 6, 2026
f0b2a42
Fix a crash in the ORC reader with malformed stripe footers (#22383)
vuule May 6, 2026
df9ea24
Rename build/probe to right/left in hash_join and distinct_hash_join …
PointKernel May 6, 2026
d9195b6
remove pylibcudf calls
galipremsagar May 6, 2026
c84f036
Merge branch 'pandas3' into groupby_bool_reduce
galipremsagar May 6, 2026
7d7bd35
Merge branch 'pandas3' into groupby_bool_reduce
galipremsagar May 7, 2026
e9dd32b
Update python/cudf/cudf/core/groupby/groupby.py
galipremsagar May 8, 2026
e03db07
Correctly handle blocks with "block byte size" fields in the Avro rea…
vuule May 6, 2026
62c8c5a
Use `token.rapids.nvidia.com` when issuing S3 bucket creds in devcont…
trxcllnt May 6, 2026
6ffe708
Use static cudart by default (#22397)
KyleFromNVIDIA May 6, 2026
6598b63
Fix `to_array` to return non-corrupted data (#22342)
galipremsagar May 6, 2026
aa0a707
Use cudaStream_t instead of cuda_stream_view in pylibcudf Cython (#22…
vyasr May 6, 2026
b45c5aa
Use `language: script` for cudf-polars-ir-signatures pre-commit hook …
vyasr May 6, 2026
8a0d5f9
Fix potential errors in Parquet page header decode (#22274)
mhaseeb123 May 7, 2026
be40780
Make RapidsMPF the default runtime for cudf_polars streaming executor…
mroeschke May 7, 2026
f49d5e8
Use thread pool to submit hybrid scan host IO tasks (#21992)
mhaseeb123 May 7, 2026
50cee5b
Python bindings and pytests for `cudf::apply_deletion_mask` (#22145)
mhaseeb123 May 7, 2026
47b699d
Refactor ``sort_actor`` to prepare for ``OrderScheme`` changes (#22350)
rjzamora May 7, 2026
996eb35
Run the cudf-polars test suite against `DaskEngine` and `RayEngine` (…
madsbk May 7, 2026
7a120b7
Address reviews
galipremsagar May 8, 2026
4dcb025
Merge branch 'pandas3' into groupby_bool_reduce
galipremsagar May 8, 2026
48c4ccd
Merge
galipremsagar May 12, 2026
3d9f864
Merge
galipremsagar May 13, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 76 additions & 7 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,10 @@
from cudf.core.column_accessor import ColumnAccessor
from cudf.core.common import pipe
from cudf.core.copy_types import GatherMap
from cudf.core.dtype.validators import is_dtype_obj_numeric
from cudf.core.dtype.validators import (
is_dtype_obj_numeric,
is_dtype_obj_string,
)
from cudf.core.dtypes import (
CategoricalDtype,
DecimalDtype,
Expand Down Expand Up @@ -3059,18 +3062,84 @@ def ewm(self, *args, **kwargs):
def any(self, skipna: bool = True, min_count: int = 0, **kwargs: Any):
    """
    Return True if any value in the group is truthful, else False.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``_bool_reduce``; selects how null values are
        treated when testing truthiness.
    min_count : int, default 0
        Forwarded to ``_bool_reduce``.
    **kwargs
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    The per-group result of ``_bool_reduce("any", ...)``.
    """
    # The reduction itself lives in ``_bool_reduce``; this method only
    # selects the operation name.
    return self._bool_reduce("any", skipna=skipna, min_count=min_count)

def all(self, skipna: bool = True, min_count: int = 0, **kwargs: Any):
    """
    Return True if all values in the group are truthful, else False.

    Parameters
    ----------
    skipna : bool, default True
        Forwarded to ``_bool_reduce``; selects how null values are
        treated when testing truthiness.
    min_count : int, default 0
        Forwarded to ``_bool_reduce``.
    **kwargs
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    The per-group result of ``_bool_reduce("all", ...)``.
    """
    # The reduction itself lives in ``_bool_reduce``; this method only
    # selects the operation name.
    return self._bool_reduce("all", skipna=skipna, min_count=min_count)

def _bool_reduce(self, op: str, *, skipna: bool, min_count: int):
    """
    Implement ``all``/``any`` as ``min``/``max`` on bool-coerced columns.

    Parameters
    ----------
    op : {"all", "any"}
        Reduction to perform; ``"all"`` maps to a grouped ``min`` and
        ``"any"`` to a grouped ``max`` over boolean values.
    skipna : bool
        If True, nulls are left in place and skipped by ``min``/``max``.
        If False, nulls are replaced with True before aggregating.
    min_count : int
        When positive, result entries whose group ``count`` is below
        this threshold are replaced with null.

    Returns
    -------
    Series or DataFrame
        The grouped aggregation result with every value column cast to
        ``np.bool_``. Group-key columns are left untouched.

    Raises
    ------
    KeyError
        If ``op`` is neither ``"all"`` nor ``"any"``.
    """
    from cudf.core.dataframe import DataFrame
    from cudf.core.series import Series

    agg_name = {"all": "min", "any": "max"}[op]
    # Fill value for groups whose values are all null: vacuously True
    # for ``all``, vacuously False for ``any``.
    fill_value = op == "all"
    bool_dtype = np.dtype(np.bool_)

    def _to_bool_col(col):
        # Coerce a value column to a bool column while keeping nulls, so
        # that min/max can skip them during aggregation.
        if is_dtype_obj_string(col.dtype):
            # A string is truthy iff it is non-empty.
            bool_col = col.count_characters() > np.int8(0)
        else:
            # Numeric/bool inputs: truthy iff non-zero.
            bool_col = col != 0
        # Normalize away pandas-extension bool dtypes so the downstream
        # aggregation always sees ``np.bool_``.
        bool_col = bool_col.astype(bool_dtype, copy=False)
        if not skipna:
            # Nulls count as True: they never flip ``all`` to False and
            # always make ``any`` True.
            bool_col = bool_col.fillna(True)
        return bool_col

    if isinstance(self.obj, Series):
        value_names = {self.obj.name}
        new_obj = Series._from_column(
            _to_bool_col(self.obj._column), name=self.obj.name
        )
    else:
        new_data = {
            col_name: _to_bool_col(self.obj._data[col_name])
            for col_name in self.grouping._values_column_names
        }
        value_names = set(new_data)
        new_obj = DataFrame._from_data(new_data, index=self.obj.index)

    # Reuse the same grouping so key columns match ``new_obj`` exactly,
    # avoiding label-based lookup when the key column was excluded.
    bool_gb = type(self)(
        new_obj,
        by=self.grouping,
        level=None,
        sort=self._sort,
        as_index=self._as_index,
        dropna=self._dropna,
    )
    result = bool_gb.agg(agg_name)

    # Per-group non-null counts, only needed when ``min_count`` applies.
    counts = self.agg("count") if min_count and min_count > 0 else None

    # Groups whose values are all null (``skipna=True``) yield NA from
    # min/max; pandas treats these as True for ``all`` and False for
    # ``any``. pandas also returns plain ``bool`` (not an extension
    # boolean dtype) even for pandas string-dtype inputs, hence the
    # unconditional cast to ``np.bool_``.
    if isinstance(result, Series):
        result = result.fillna(fill_value).astype(bool_dtype)
        if counts is not None:
            result = result.where(counts >= min_count, None)
        return result

    for col_name in result._column_names:
        # With ``as_index=False`` the aggregation result is expected to
        # carry the group keys as ordinary columns; those must not be
        # filled, cast to bool, or masked by ``min_count``.
        if col_name not in value_names:
            continue
        reduced = result[col_name].fillna(fill_value).astype(bool_dtype)
        if counts is not None:
            reduced = reduced.where(counts[col_name] >= min_count, None)
        result[col_name] = reduced
    return result


class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
Expand Down
32 changes: 0 additions & 32 deletions python/cudf/cudf/pandas/scripts/conftest-patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3119,14 +3119,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_nunique_with_NaT[key0-data0-True-expected0]": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_nunique_with_NaT[key1-data1-True-expected1]": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_nunique_with_timegrouper": "TODO: Add a reason for failure",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-True-False-0]": "AssertionError: DataFrame are different",
Expand All @@ -3147,14 +3139,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-True-False-0]": "AssertionError: DataFrame are different",
Expand All @@ -3175,14 +3159,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-True-False-0]": "AssertionError: DataFrame are different",
Expand All @@ -3203,14 +3179,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-True-False-0]": "AssertionError: DataFrame are different",
Expand Down
45 changes: 45 additions & 0 deletions python/cudf/cudf/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -1191,6 +1191,51 @@ def test_string_groupby_key_index():
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize("op", ["all", "any"])
@pytest.mark.parametrize(
    "data",
    [
        [True, False, True, True, False, False],
        [1, 0, 2, 3, 0, 0],
        [1.0, 0.0, 2.5, 3.5, 0.0, 0.0],
    ],
)
def test_groupby_all_any(op, data):
    """Compare cudf and pandas groupby ``all``/``any`` on bool, int and
    float value columns."""
    host_df = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": data})
    device_df = cudf.from_pandas(host_df)
    # NOTE(review): the extent of this ``with`` block is inferred (the
    # scrape lost indentation) - confirm against the original file.
    with cudf.option_context("mode.pandas_compatible", True):
        device_reduce = getattr(device_df.groupby("a"), op)
        got = device_reduce()
    host_reduce = getattr(host_df.groupby("a"), op)
    expect = host_reduce()
    assert_eq(expect, got)


@pytest.mark.parametrize("op", ["all", "any"])
def test_groupby_all_any_string(op):
    """Compare cudf and pandas groupby ``all``/``any`` on a string value
    column that includes empty strings."""
    columns = {"a": [1, 1, 2, 2, 3, 3], "b": ["x", "", "", "", "y", "z"]}
    host_df = pd.DataFrame(columns)
    device_df = cudf.from_pandas(host_df)
    # NOTE(review): the extent of this ``with`` block is inferred (the
    # scrape lost indentation) - confirm against the original file.
    with cudf.option_context("mode.pandas_compatible", True):
        device_reduce = getattr(device_df.groupby("a"), op)
        got = device_reduce()
    host_reduce = getattr(host_df.groupby("a"), op)
    expect = host_reduce()
    assert_eq(expect, got)


@pytest.mark.parametrize("op", ["all", "any"])
def test_groupby_all_any_empty(op):
    """Compare cudf and pandas groupby ``all``/``any`` on a frame with
    zero rows."""
    empty_keys = pd.array([], dtype="int64")
    empty_values = pd.array([], dtype="bool")
    host_df = pd.DataFrame({"a": empty_keys, "b": empty_values})
    device_df = cudf.from_pandas(host_df)
    # NOTE(review): the extent of this ``with`` block is inferred (the
    # scrape lost indentation) - confirm against the original file.
    with cudf.option_context("mode.pandas_compatible", True):
        device_reduce = getattr(device_df.groupby("a"), op)
        got = device_reduce()
    host_reduce = getattr(host_df.groupby("a"), op)
    expect = host_reduce()
    # The index type is deliberately not compared for the empty result.
    assert_eq(expect, got, check_index_type=False)


@pytest.mark.parametrize(
"string_dtype",
[
Expand Down
Loading