diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 239352bd99b..52d613d9cce 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -449,6 +449,35 @@ def __exit__(self, exc_type, exc_val, exc_tb): return False +def _collect_series_key_column_names(obj, by) -> list[Hashable]: + """Identify, for each Series grouping key in ``by``, the name of the + corresponding column in ``obj`` whose underlying column object is + identical to the Series' column. Returns a list (one entry per Series + key, in order) of column names or ``None``. Non-Series keys produce no + entry. The check uses object identity to mirror pandas' behavior of + excluding such columns from aggregation values. + + Only applies when ``obj`` is a DataFrame: for Series inputs, the single + column *is* the value column, so identity-based exclusion would empty + the aggregation result. + """ + import cudf + + result: list[Hashable] = [] + if not isinstance(obj, cudf.DataFrame): + return result + by_list = by if isinstance(by, list) else [by] + for key in by_list: + if isinstance(key, cudf.Series): + matched = None + for col_name, col in obj._column_labels_and_values: + if col is key._column: + matched = col_name + break + result.append(matched) + return result + + class GroupBy(Serializable, Reducible, Scannable): obj: Series | DataFrame @@ -523,6 +552,11 @@ def __init__( dropna : bool, optional If True (default), do not include the "null" group. """ + # Determine which column names in `obj` correspond to the grouping + # key Series by column identity (mirrors pandas' behavior). + # Must be done before ``nans_to_nulls`` which breaks identity. 
+ by_series_col_names = _collect_series_key_column_names(obj, by) + if get_option("mode.pandas_compatible"): obj = obj.nans_to_nulls() self.obj = obj @@ -537,7 +571,9 @@ def __init__( self._by._obj = self.obj self.grouping = self._by else: - self.grouping = _Grouping(obj, self._by, level) + self.grouping = _Grouping( + obj, self._by, level, by_series_col_names + ) self._groupby_manager = _GroupByContextManager( self.grouping, self._dropna @@ -702,7 +738,8 @@ def size(self) -> Series: .groupby(self.grouping, sort=self._sort, dropna=self._dropna) .agg("size") ) - if isinstance(getattr(self.obj, "dtype", None), pd.ArrowDtype): + obj_dtype = getattr(self.obj, "dtype", None) + if isinstance(obj_dtype, pd.ArrowDtype): # TODO: Remove once groupby.agg preserves pandas extension dtypes. arrow_dtype = pd.ArrowDtype(pa.int64()) if isinstance(result, Series): @@ -713,6 +750,23 @@ def size(self) -> Series: result._data["size"] = ColumnBase.create( result._data["size"].plc_column, arrow_dtype ) + elif ( + isinstance(obj_dtype, pd.StringDtype) + and obj_dtype.storage == "pyarrow" + and obj_dtype.na_value is pd.NA + ): + # Series.groupby.size() on ``string[pyarrow]`` returns Int64. 
+ int64_dtype = pd.Int64Dtype() + if isinstance(result, Series): + result = Series._from_column( + ColumnBase.create(result._column.plc_column, int64_dtype), + name=result.name, + index=result.index, + ) + elif "size" in result._column_names: + result._data["size"] = ColumnBase.create( + result._data["size"].plc_column, int64_dtype + ) if not self._as_index: result = result.rename("size").reset_index() return result @@ -1083,9 +1137,14 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): ) # Override for specific aggregation types that need dtype adjustments if agg_kind in {"COUNT", "SIZE", "ARGMIN", "ARGMAX"}: - cast_dtype = get_dtype_of_same_kind( - orig_dtype, np.dtype(np.int64) - ) + if isinstance(orig_dtype, pd.StringDtype): + cast_dtype = np.dtype(np.int64) + else: + cast_dtype = get_dtype_of_same_kind( + orig_dtype, np.dtype(np.int64) + ) + elif agg_kind == "NUNIQUE": + cast_dtype = np.dtype(np.int64) elif ( ( isinstance(agg_name, str) @@ -3464,7 +3523,7 @@ def __init__( class _Grouping(Serializable): - def __init__(self, obj, by=None, level=None): + def __init__(self, obj, by=None, level=None, series_key_column_names=None): self._obj = obj self._key_columns = [] self.names = [] @@ -3472,6 +3531,10 @@ def __init__(self, obj, by=None, level=None): # Need to keep track of named key columns # to support `as_index=False` correctly self._named_columns = [] + # For each Series-typed grouping key (in order), the name of the + # ``obj`` column that the Series' underlying column is identical + # to (or ``None`` if the Series is unrelated to any column). 
+ self._series_key_column_names = list(series_key_column_names or []) self._handle_by_or_level(by, level) if len(obj) and not len(self._key_columns): @@ -3553,6 +3616,13 @@ def _handle_series(self, by): by = by._align_to_index(self._obj.index, how="right") self._key_columns.append(by._column) self.names.append(by.name) + # Mirror pandas: if the grouping Series' underlying column was one + # of the obj's columns (identity checked before any transformation), + # exclude that column name from value columns during aggregation. + if self._series_key_column_names: + col_name = self._series_key_column_names.pop(0) + if col_name is not None: + self._named_columns.append(col_name) def _handle_index(self, by): self._key_columns.extend(by._columns) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 9b2c16d2e20..f91682d1e28 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -800,7 +800,6 @@ def pytest_unconfigure(config): "tests/copy_view/test_methods.py::test_frame_set_axis": "TODO: Add a reason for failure", "tests/copy_view/test_methods.py::test_get[a]": "TODO: Add a reason for failure", "tests/copy_view/test_methods.py::test_get[key1]": "TODO: Add a reason for failure", - "tests/copy_view/test_methods.py::test_groupby_column_index_in_references": "TODO: Add a reason for failure", "tests/copy_view/test_methods.py::test_infer_objects": "TODO: Add a reason for failure", "tests/copy_view/test_methods.py::test_infer_objects_no_reference": "AssertionError: assert False", "tests/copy_view/test_methods.py::test_insert_series": "TODO: Add a reason for failure", @@ -3054,7 +3053,6 @@ def pytest_unconfigure(config): "tests/groupby/methods/test_size.py::test_size_sort[False-A]": "TODO: Add a reason for failure", "tests/groupby/methods/test_size.py::test_size_sort[False-B]": "TODO: Add a reason for failure", 
"tests/groupby/methods/test_size.py::test_size_sort[False-by2]": "TODO: Add a reason for failure", - "tests/groupby/methods/test_size.py::test_size_strings[string=string[pyarrow]]": "TODO: Add a reason for failure", "tests/groupby/methods/test_value_counts.py::test_against_frame_and_seriesgroupby[False-False-False-None-False-count-array]": "TODO: Add a reason for failure", "tests/groupby/methods/test_value_counts.py::test_against_frame_and_seriesgroupby[False-False-False-None-True-proportion-array]": "TODO: Add a reason for failure", "tests/groupby/methods/test_value_counts.py::test_against_frame_and_seriesgroupby[False-False-True-False-False-count-array]": "TODO: Add a reason for failure", @@ -3110,8 +3108,6 @@ def pytest_unconfigure(config): "tests/groupby/test_all_methods.py::test_duplicate_columns[cummin-False]": "TODO: Add a reason for failure", "tests/groupby/test_all_methods.py::test_duplicate_columns[cumprod-False]": "TODO: Add a reason for failure", "tests/groupby/test_all_methods.py::test_duplicate_columns[cumsum-False]": "TODO: Add a reason for failure", - "tests/groupby/test_all_methods.py::test_duplicate_columns[nunique-False]": "TODO: Add a reason for failure", - "tests/groupby/test_all_methods.py::test_duplicate_columns[nunique-True]": "TODO: Add a reason for failure", "tests/groupby/test_all_methods.py::test_duplicate_columns[rank-False]": "TODO: Add a reason for failure", "tests/groupby/test_all_methods.py::test_not_c_contiguous_mask[all]": "assert not True", "tests/groupby/test_all_methods.py::test_not_c_contiguous_mask[any]": "assert not True", @@ -3203,7 +3199,6 @@ def pytest_unconfigure(config): "tests/groupby/test_categorical.py::test_agg_list[False-True-median-keys0-True]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_agg_list[False-True-min-keys0-True]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_agg_list[False-True-nunique-keys0-True]": "TODO: Add a reason for failure", - 
"tests/groupby/test_categorical.py::test_agg_list[False-True-nunique-keys1-True]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_agg_list[False-True-prod-keys0-True]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_agg_list[False-True-size-keys0-True]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_agg_list[False-True-std-keys0-True]": "TODO: Add a reason for failure", @@ -3214,7 +3209,6 @@ def pytest_unconfigure(config): "tests/groupby/test_categorical.py::test_category_order_apply[False-True-True-False-apply-range]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_category_order_apply[False-True-True-True-apply-range]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_describe_categorical_columns": "TODO: Add a reason for failure", - "tests/groupby/test_categorical.py::test_groupby_categorical_observed_nunique": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_observed[False]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_observed_codes_remap[False]": "TODO: Add a reason for failure", "tests/groupby/test_categorical.py::test_observed_two_columns[False]": "AssertionError: DataFrame.index level [0] are different", @@ -3263,13 +3257,11 @@ def pytest_unconfigure(config): "tests/groupby/test_groupby.py::test_groupby_multiindex_not_lexsorted[True]": "AssertionError: assert not True", "tests/groupby/test_groupby.py::test_groupby_nat_exclude": "AssertionError: numpy array are different", "tests/groupby/test_groupby.py::test_groupby_nonobject_dtype": "TODO: Add a reason for failure", - "tests/groupby/test_groupby.py::test_groupby_nonstring_columns": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_numerical_stability_cumsum": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_numerical_stability_sum_mean[mean-values1]": 
"TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_numerical_stability_sum_mean[sum-values0]": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_one_row": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_overflow[222-uint]": "TODO: Add a reason for failure", - "tests/groupby/test_groupby.py::test_groupby_series_with_name": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_sum_support_mask[Int16-prod-2]": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_sum_support_mask[Int16-sum-3]": "TODO: Add a reason for failure", "tests/groupby/test_groupby.py::test_groupby_sum_support_mask[Int32-prod-2]": "TODO: Add a reason for failure", @@ -3564,10 +3556,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_nunique": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_nunique_with_NaT[key0-data0-True-expected0]": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_nunique_with_NaT[key1-data1-True-expected1]": "TODO: Add a reason for failure", - "tests/groupby/test_reductions.py::test_nunique_with_NaT[key2-data2-False-expected2]": "TODO: Add a reason for failure", - "tests/groupby/test_reductions.py::test_nunique_with_NaT[key3-data3-False-expected3]": "TODO: Add a reason for failure", - "tests/groupby/test_reductions.py::test_nunique_with_empty_series": "TODO: Add a reason for failure", - "tests/groupby/test_reductions.py::test_nunique_with_object": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_nunique_with_timegrouper": "TODO: Add a reason for failure", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different", @@ 
-3577,10 +3565,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-count-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-count-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-count-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-count-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-first-True-False-0]": "AssertionError: DataFrame are different", @@ -3597,14 +3581,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-min-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-min-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-min-True-False-1]": "AssertionError: DataFrame are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-False-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-False-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-True-1]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different", @@ -3617,10 +3593,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-0]": "AssertionError: DataFrame are 
different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-any-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-count-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-count-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-count-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-count-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-first-False-True-0]": "AssertionError: Attributes of Series are different", @@ -3645,14 +3617,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-min-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-min-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-min-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-False-False-1]": "AssertionError: DataFrame are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-False-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-False-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-True-1]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-0]": "AssertionError: DataFrame are different", @@ -3665,14 +3629,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-any-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-False-False-0]": "AssertionError: DataFrame are 
different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-False-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-False-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-count-True-True-1]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-first-False-True-0]": "AssertionError: Attributes of Series are different", @@ -3697,18 +3653,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-min-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-min-True-False-1]": "AssertionError: DataFrame are different", 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-min-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-False-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-False-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-nunique-True-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-False-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-False-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-True-True-1]": 
"AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different", @@ -3721,14 +3665,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-any-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-False-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-False-True-1]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-True-False-1]": "AssertionError: DataFrame are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-count-True-True-1]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-first-False-True-0]": "AssertionError: Attributes of Series are different", @@ -3753,14 +3689,6 @@ def pytest_unconfigure(config): "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-min-True-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-min-True-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-min-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-False-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-False-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-False-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-False-True-1]": "AssertionError: Attributes of Series are different", - 
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-False-0]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-False-1]": "AssertionError: DataFrame are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-True-0]": "AssertionError: Attributes of Series are different", - "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-True-1]": "AssertionError: Attributes of Series are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-False-False-0]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-False-False-1]": "AssertionError: DataFrame are different", "tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-True-False-0]": "AssertionError: DataFrame are different", @@ -3782,8 +3710,6 @@ def pytest_unconfigure(config): "tests/groupby/transform/test_transform.py::test_as_index_no_change[cumprod-keys1]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_as_index_no_change[cumsum-A]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_as_index_no_change[cumsum-keys1]": "TODO: Add a reason for failure", - "tests/groupby/transform/test_transform.py::test_as_index_no_change[nunique-A]": "TODO: Add a reason for failure", - "tests/groupby/transform/test_transform.py::test_as_index_no_change[nunique-keys1]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_as_index_no_change[size-A]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_as_index_no_change[size-keys1]": "TODO: Add a reason for failure", 
"tests/groupby/transform/test_transform.py::test_cython_transform_series[cumprod-args0-]": "TODO: Add a reason for failure", @@ -3819,7 +3745,6 @@ def pytest_unconfigure(config): "tests/groupby/transform/test_transform.py::test_nan_in_cumsum_group_label": "AssertionError: Attributes of Series are different", "tests/groupby/transform/test_transform.py::test_null_group_str_reducer[False-idxmax]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_null_group_str_reducer[False-idxmin]": "TODO: Add a reason for failure", - "tests/groupby/transform/test_transform.py::test_null_group_str_reducer[False-nunique]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_null_group_str_reducer[False-size]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_null_group_str_reducer[True-idxmax]": "TODO: Add a reason for failure", "tests/groupby/transform/test_transform.py::test_null_group_str_reducer[True-idxmin]": "TODO: Add a reason for failure", diff --git a/python/cudf/cudf/tests/groupby/test_reductions.py b/python/cudf/cudf/tests/groupby/test_reductions.py index aa82f6ae026..46ed3eb3fa8 100644 --- a/python/cudf/cudf/tests/groupby/test_reductions.py +++ b/python/cudf/cudf/tests/groupby/test_reductions.py @@ -1191,6 +1191,61 @@ def test_string_groupby_key_index(): assert_eq(expect, got, check_dtype=False) +@pytest.mark.parametrize( + "string_dtype", + [ + pd.StringDtype(storage="python", na_value=pd.NA), + pd.StringDtype(storage="python", na_value=np.nan), + pd.StringDtype(storage="pyarrow", na_value=pd.NA), + pd.StringDtype(storage="pyarrow", na_value=np.nan), + ], +) +@pytest.mark.parametrize("op", ["count", "nunique", "size"]) +def test_groupby_string_int_returning_aggs_dtype(string_dtype, op): + psr = pd.Series( + ["x", "y", "x", None, "z"], + dtype=string_dtype, + name="b", + ) + pkeys = pd.Series([1, 1, 2, 2, 3], name="a") + gsr = cudf.from_pandas(psr) + gkeys = 
cudf.from_pandas(pkeys) + with cudf.option_context("mode.pandas_compatible", True): + got = getattr(gsr.groupby(gkeys), op)() + expect = getattr(psr.groupby(pkeys), op)() + assert_eq(expect, got) + + +def test_groupby_series_identity_column_exclusion(): + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2, 3, 3], "b": [10, 20, 30, 40, 50, 60]} + ) + gdf = cudf.from_pandas(pdf) + with cudf.option_context("mode.pandas_compatible", True): + got = gdf.groupby(gdf["a"]).sum() + expect = pdf.groupby(pdf["a"]).sum() + assert_eq(expect, got) + + +def test_groupby_series_copy_no_column_exclusion(): + pdf = pd.DataFrame( + {"a": [1, 1, 2, 2, 3, 3], "b": [10, 20, 30, 40, 50, 60]} + ) + gdf = cudf.from_pandas(pdf) + with cudf.option_context("mode.pandas_compatible", True): + got = gdf.groupby(gdf["a"].copy()).sum() + expect = pdf.groupby(pdf["a"].copy()).sum() + assert_eq(expect, got) + + +def test_groupby_series_self_does_not_exclude(): + psr = pd.Series([1, 1, 2, 2, 3, 3], name="a") + gsr = cudf.from_pandas(psr) + with cudf.option_context("mode.pandas_compatible", True): + got = gsr.groupby(gsr).count() + expect = psr.groupby(psr).count() + # A Series input has only its own column, so it must remain in the result. + assert_eq(expect, got) + + @pytest.mark.parametrize("op", ["sum", "min", "max", "first", "last"]) @pytest.mark.parametrize("min_count", [0, 1, 2, 3, 5]) def test_groupby_reduce_min_count(op, min_count):