diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index c87015eb4e4..5ae1120998c 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -32,7 +32,10 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.common import pipe
 from cudf.core.copy_types import GatherMap
-from cudf.core.dtype.validators import is_dtype_obj_numeric
+from cudf.core.dtype.validators import (
+    is_dtype_obj_numeric,
+    is_dtype_obj_string,
+)
 from cudf.core.dtypes import (
     CategoricalDtype,
     DecimalDtype,
@@ -3059,18 +3062,110 @@ def ewm(self, *args, **kwargs):
     def any(self, skipna: bool = True, min_count: int = 0, **kwargs: Any):
         """
         Return True if any value in the group is truthful, else False.
-
-        Currently not implemented.
         """
-        raise NotImplementedError("any is currently not implemented")
+        return self._bool_reduce("any", skipna=skipna, min_count=min_count)
 
     def all(self, skipna: bool = True, min_count: int = 0, **kwargs: Any):
         """
         Return True if all values in the group are truthful, else False.
-
-        Currently not implemented.
         """
-        raise NotImplementedError("all is currently not implemented")
+        return self._bool_reduce("all", skipna=skipna, min_count=min_count)
+
+    def _bool_reduce(self, op: str, *, skipna: bool, min_count: int):
+        """
+        Implement ``all``/``any`` as ``min``/``max`` on bool-coerced values.
+
+        Parameters
+        ----------
+        op : {"all", "any"}
+            The boolean reduction to perform.
+        skipna : bool
+            If True, nulls are ignored by the reduction. If False, nulls
+            are replaced with True before reducing.
+        min_count : int
+            When positive, results for groups whose ``count`` aggregation
+            is below ``min_count`` are replaced with null.
+
+        Returns
+        -------
+        Series or DataFrame
+            Boolean reduction of every value column for each group. The
+            group keys form the index when ``as_index=True`` and are
+            moved into columns otherwise.
+        """
+        from cudf.core.dataframe import DataFrame
+        from cudf.core.series import Series
+
+        agg_name = {"all": "min", "any": "max"}[op]
+        # Fill for groups without valid values: vacuously True for
+        # ``all``, vacuously False for ``any``.
+        fill_value = op == "all"
+        bool_dtype = np.dtype(np.bool_)
+
+        def _to_bool_col(col):
+            # Coerce to a nullable bool column so nulls survive until the
+            # aggregation (min/max skip nulls).
+            if is_dtype_obj_string(col.dtype):
+                # A string is truthy when it is non-empty.
+                bool_col = col.count_characters() > np.int8(0)
+            else:
+                # For numeric/bool inputs, cast to bool preserving nulls.
+                bool_col = col != 0
+            # Normalize away pandas-extension bool dtypes so the downstream
+            # aggregation always sees ``np.bool_``.
+            bool_col = bool_col.astype(bool_dtype, copy=False)
+            if not skipna:
+                # Nulls count as True: they never flip ``all`` to False
+                # and always make ``any`` True.
+                bool_col = bool_col.fillna(True)
+            return bool_col
+
+        if isinstance(self.obj, Series):
+            new_obj = Series._from_column(
+                _to_bool_col(self.obj._column), name=self.obj.name
+            )
+        else:
+            new_data = {
+                col_name: _to_bool_col(self.obj._data[col_name])
+                for col_name in self.grouping._values_column_names
+            }
+            new_obj = DataFrame._from_data(new_data, index=self.obj.index)
+
+        def _regroup(obj):
+            # Reuse the existing grouping so the keys line up with ``obj``
+            # without any label-based lookup. Keys always stay in the index
+            # here so the post-processing below only touches value columns;
+            # ``as_index=False`` is honoured once, at the very end.
+            return type(self)(
+                obj,
+                by=self.grouping,
+                level=None,
+                sort=self._sort,
+                as_index=True,
+                dropna=self._dropna,
+            )
+
+        result = _regroup(new_obj).agg(agg_name)
+
+        # Empty groups (skipna=True with all-NA values) yield NA from
+        # min/max — pandas treats these as ``True`` for ``all`` and
+        # ``False`` for ``any``.
+        if isinstance(result, Series):
+            result = result.fillna(fill_value).astype(bool_dtype)
+        else:
+            for col_name in result._column_names:
+                result[col_name] = (
+                    result[col_name].fillna(fill_value).astype(bool_dtype)
+                )
+
+        if min_count and min_count > 0:
+            # Mask groups whose ``count`` is below ``min_count``.
+            counts = _regroup(self.obj).agg("count")
+            result = result.where(counts >= min_count, None)
+
+        if not self._as_index:
+            result = result.reset_index()
+        return result
 
 
 class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index acf9ce1ec99..1846125b315 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -3119,14 +3119,6 @@ def pytest_unconfigure(config):
     "tests/groupby/test_reductions.py::test_nunique_with_NaT[key0-data0-True-expected0]": "TODO: Add a reason for failure",
     "tests/groupby/test_reductions.py::test_nunique_with_NaT[key1-data1-True-expected1]": "TODO: Add a reason for failure",
     "tests/groupby/test_reductions.py::test_nunique_with_timegrouper": "TODO: Add a reason for failure",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-True-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-False-False-0]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-False-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-True-False-0]": "AssertionError: DataFrame are different",
@@ -3147,14 +3139,6 @@ def pytest_unconfigure(config):
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-False-False-0]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-False-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-True-False-0]": "AssertionError: DataFrame are different",
@@ -3175,14 +3159,6 @@ def pytest_unconfigure(config):
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-0]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-False-False-0]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-False-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-True-False-0]": "AssertionError: DataFrame are different",
@@ -3203,14 +3179,6 @@ def pytest_unconfigure(config):
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-1]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-0]": "AssertionError: DataFrame are different",
-    "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-False-False-0]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-False-False-1]": "AssertionError: DataFrame are different",
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-True-False-0]": "AssertionError: DataFrame are different",
diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py
index 7a890f99b48..db3210ee270 100644
--- a/python/cudf/cudf/tests/groupby/test_reductions.py
+++ b/python/cudf/cudf/tests/groupby/test_reductions.py
@@ -1191,6 +1191,62 @@ def test_string_groupby_key_index():
     assert_eq(expect, got, check_dtype=False)
 
 
+@pytest.mark.parametrize("op", ["all", "any"])
+@pytest.mark.parametrize(
+    "data",
+    [
+        [True, False, True, True, False, False],
+        [1, 0, 2, 3, 0, 0],
+        [1.0, 0.0, 2.5, 3.5, 0.0, 0.0],
+    ],
+)
+def test_groupby_all_any(op, data):
+    pdf = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": data})
+    gdf = cudf.from_pandas(pdf)
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = getattr(gdf.groupby("a"), op)()
+    expect = getattr(pdf.groupby("a"), op)()
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("op", ["all", "any"])
+def test_groupby_all_any_string(op):
+    pdf = pd.DataFrame(
+        {"a": [1, 1, 2, 2, 3, 3], "b": ["x", "", "", "", "y", "z"]}
+    )
+    gdf = cudf.from_pandas(pdf)
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = getattr(gdf.groupby("a"), op)()
+    expect = getattr(pdf.groupby("a"), op)()
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("op", ["all", "any"])
+def test_groupby_all_any_empty(op):
+    pdf = pd.DataFrame(
+        {
+            "a": pd.array([], dtype="int64"),
+            "b": pd.array([], dtype="bool"),
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = getattr(gdf.groupby("a"), op)()
+    expect = getattr(pdf.groupby("a"), op)()
+    assert_eq(expect, got, check_index_type=False)
+
+
+@pytest.mark.parametrize("op", ["all", "any"])
+def test_groupby_all_any_as_index_false(op):
+    # Group keys must come back untouched as a regular column.
+    pdf = pd.DataFrame({"a": [1, 1, 2, 2, 3, 3], "b": [1, 0, 0, 0, 2, 3]})
+    gdf = cudf.from_pandas(pdf)
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = getattr(gdf.groupby("a", as_index=False, sort=True), op)()
+    expect = getattr(pdf.groupby("a", as_index=False, sort=True), op)()
+    assert_eq(expect, got)
+
+
 @pytest.mark.parametrize(
     "string_dtype",
     [