diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 239352bd99b..af84df22433 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -19,7 +19,7 @@
 import pylibcudf as plc
 
 from cudf.api.types import is_list_like, is_scalar
-from cudf.core._internals import aggregation, sorting
+from cudf.core._internals import aggregation, binaryop, sorting
 from cudf.core.abc import Serializable
 from cudf.core.column import access_columns
 from cudf.core.column.column import (
@@ -1249,6 +1249,12 @@ def _reduce(
         """
         if numeric_only:
             return self._reduce_numeric_only(op)
+
+        if op == "sum" and self._has_string_value_column():
+            return self._string_sum(
+                skipna=kwargs.get("skipna", True), min_count=min_count
+            )
+
         result = self.agg(op)
         if min_count and min_count > 0:
             counts = self.agg("count")
@@ -1259,6 +1265,157 @@ def _scan(self, op: str, *args, **kwargs):
         """{op_name} for each group."""
         return self.agg(op)
 
+    def _has_string_value_column(self) -> bool:
+        """Return True if any aggregated column has a ``pd.StringDtype``.
+
+        For a Series the Series dtype is checked; otherwise the dtype of
+        every column named in ``self.grouping._values_column_names`` is
+        checked.
+        """
+        from cudf.core.series import Series
+
+        if isinstance(self.obj, Series):
+            return isinstance(self.obj.dtype, pd.StringDtype)
+        return any(
+            isinstance(self.obj._data[col_name].dtype, pd.StringDtype)
+            for col_name in self.grouping._values_column_names
+        )
+
+    def _string_sum(self, *, skipna: bool, min_count: int):
+        """Implement groupby sum for StringDtype columns as per-group
+        string concatenation.
+
+        Each value column is collected into one list per group and the
+        list elements are joined with an empty separator.
+
+        Parameters
+        ----------
+        skipna : bool
+            If True, the join uses an empty string as the null
+            replacement and the ``EMPTY_STRING`` empty-list policy. If
+            False, a null replacement scalar and the ``NULL_ELEMENT``
+            empty-list policy are used instead.
+        min_count : int
+            Groups whose ``count`` aggregation is below this value are
+            set to null. ``None``, zero and negative values disable the
+            check, matching the guard in ``_reduce``.
+
+        Returns
+        -------
+        Series or DataFrame
+            The same kind as ``self.obj``, indexed by the group keys.
+
+        Raises
+        ------
+        NotImplementedError
+            If ``self.obj`` is a DataFrame and any value column is not a
+            ``pd.StringDtype`` column.
+        """
+        from cudf.core.column import ColumnBase
+        from cudf.core.dataframe import DataFrame
+        from cudf.core.series import Series
+
+        is_series = isinstance(self.obj, Series)
+        value_cols = {}
+        if not is_series:
+            value_cols = {
+                col_name: self.obj._data[col_name]
+                for col_name in self.grouping._values_column_names
+            }
+            # Reject unsupported frames before doing any device work.
+            # TODO: handle mixed dtype frames
+            if not all(
+                isinstance(col.dtype, pd.StringDtype)
+                for col in value_cols.values()
+            ):
+                raise NotImplementedError(
+                    "sum on mixed string and non-string columns is "
+                    "not yet supported"
+                )
+
+        # ``min_count`` may be None or 0 (``_reduce`` guards it with a
+        # truthiness test), so check that before comparing.
+        check_count = bool(min_count) and min_count > 0
+
+        # Scalars and policies are the same for every column.
+        combine = plc.strings.combine
+        empty_str = plc.Scalar.from_py("")
+        null_str = plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING))
+        if skipna:
+            string_narep = empty_str
+            empty_policy = combine.OutputIfEmptyList.EMPTY_STRING
+        else:
+            string_narep = null_str
+            empty_policy = combine.OutputIfEmptyList.NULL_ELEMENT
+
+        def _sum_column(string_col):
+            # One groupby pass produces the per-group lists and, when
+            # ``min_count`` applies, the per-group counts.
+            aggs = [plc.aggregation.collect_list()]
+            if check_count:
+                aggs.append(plc.aggregation.count())
+            requests = [
+                plc.groupby.GroupByRequest(string_col.plc_column, aggs)
+            ]
+            with access_columns(string_col, mode="read", scope="internal"):
+                with self._groupby_manager as plc_groupby:
+                    keys, results = plc_groupby.aggregate(requests)
+            list_col, *count_cols = results[0].columns()
+            joined = combine.join_list_elements(
+                list_col,
+                empty_str,  # separator
+                empty_str,  # separator_narep
+                string_narep,
+                combine.SeparatorOnNulls.YES,
+                empty_policy,
+            )
+            result_col = ColumnBase.create(joined, string_col.dtype)
+            if count_cols:
+                count_plc = count_cols[0]
+                count_col = ColumnBase.create(
+                    count_plc, dtype_from_pylibcudf_column(count_plc)
+                )
+                keep_mask = binaryop.binaryop(
+                    count_col,
+                    plc.Scalar.from_py(min_count),
+                    "__ge__",
+                    np.dtype(np.bool_),
+                )
+                result_col = result_col.copy_if_else(null_str, keep_mask)
+            return result_col, keys
+
+        def _build_index(keys):
+            # Rebuild the key index with the original key dtypes.
+            return self.grouping.keys._from_columns_like_self(
+                [
+                    ColumnBase.create(key, key_col.dtype)
+                    for key, key_col in zip(
+                        keys.columns(),
+                        self.grouping._key_columns,
+                        strict=True,
+                    )
+                ]
+            )
+
+        if is_series:
+            out_col, keys = _sum_column(self.obj._column)
+            return Series._from_column(
+                out_col, name=self.obj.name, index=_build_index(keys)
+            )
+
+        out_data = {}
+        keys = None
+        for col_name, col in value_cols.items():
+            out_data[col_name], col_keys = _sum_column(col)
+            if keys is None:
+                # Every column is grouped by the same keys, so the first
+                # key table serves as the shared index.
+                keys = col_keys
+        # ``_reduce`` only dispatches here when a string value column
+        # exists, so at least one column has been processed.
+        assert keys is not None
+        return DataFrame._from_data(out_data, index=_build_index(keys))
+
     aggregate = agg
 
     def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 9b2c16d2e20..06125df60f5 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -3605,10 +3605,6 @@ def pytest_unconfigure(config):
     "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-True-0]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-0]": "AssertionError: DataFrame are different", @@ -3653,10 +3649,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-True-0]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-0]": "AssertionError: DataFrame are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different", @@ -3709,10 +3701,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-False-True-1]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-True-True-0]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-True-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different", 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-0]": "AssertionError: DataFrame are different", @@ -3761,10 +3749,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-True-0]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_sum_skipna[False-values0-float64]": "AssertionError: Series are different", "tests/groupby/test_reductions.py::test_sum_skipna[False-values3-timedelta64[ns]]": "AssertionError: Series are different", "tests/groupby/test_reductions.py::test_sum_skipna_object[False]": "AssertionError: Series are different", diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py 
b/python/cudf/cudf/tests/groupby/test_reductions.py
index aa82f6ae026..ecb89b052a2 100644
--- a/python/cudf/cudf/tests/groupby/test_reductions.py
+++ b/python/cudf/cudf/tests/groupby/test_reductions.py
@@ -911,8 +911,14 @@ def test_group_by_empty_reduction(
     )
     request.applymarker(
         pytest.mark.xfail(
-            condition=all_supported_types_as_str in {"str", "category"}
-            and groupby_reduction_methods in {"sum", "prod", "mean"},
+            condition=(
+                all_supported_types_as_str in {"str", "category"}
+                and groupby_reduction_methods in {"prod", "mean"}
+            )
+            or (
+                all_supported_types_as_str == "category"
+                and groupby_reduction_methods == "sum"
+            ),
             raises=TypeError,
             reason=f"{all_supported_types_as_str} raises TypeError with {groupby_reduction_methods}",
         )
@@ -1191,6 +1197,37 @@ def test_string_groupby_key_index():
     assert_eq(expect, got, check_dtype=False)
 
 
+@pytest.mark.parametrize("min_count", [0, 1, 2])
+@pytest.mark.parametrize(
+    "string_dtype",
+    [
+        pd.StringDtype(storage="python", na_value=pd.NA),
+        pd.StringDtype(storage="python", na_value=np.nan),
+        pd.StringDtype(storage="pyarrow", na_value=pd.NA),
+        pd.StringDtype(storage="pyarrow", na_value=np.nan),
+    ],
+)
+def test_groupby_string_sum(string_dtype, min_count):
+    """Groupby sum on a StringDtype column should match pandas.
+
+    The data includes a null and an empty string, and ``min_count``
+    values below, at and above the smallest group's valid count.
+    """
+    pdf = pd.DataFrame(
+        {
+            "a": [1, 1, 2, 2, 3, 3],
+            "b": pd.array(
+                ["x", "y", "", "z", "q", None], dtype=string_dtype
+            ),
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gdf.groupby("a").sum(min_count=min_count)
+    expect = pdf.groupby("a").sum(min_count=min_count)
+    assert_eq(expect, got)
+
+
 @pytest.mark.parametrize("op", ["sum", "min", "max", "first", "last"])
 @pytest.mark.parametrize("min_count", [0, 1, 2, 3, 5])
 def test_groupby_reduce_min_count(op, min_count):