Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 141 additions & 1 deletion python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import pylibcudf as plc

from cudf.api.types import is_list_like, is_scalar
from cudf.core._internals import aggregation, sorting
from cudf.core._internals import aggregation, binaryop, sorting
from cudf.core.abc import Serializable
from cudf.core.column import access_columns
from cudf.core.column.column import (
Expand Down Expand Up @@ -1249,6 +1249,12 @@ def _reduce(
"""
if numeric_only:
return self._reduce_numeric_only(op)

if op == "sum" and self._has_string_value_column():
return self._string_sum(
skipna=kwargs.get("skipna", True), min_count=min_count
)

result = self.agg(op)
if min_count and min_count > 0:
counts = self.agg("count")
Expand All @@ -1259,6 +1265,140 @@ def _scan(self, op: str, *args, **kwargs):
"""{op_name} for each group."""
return self.agg(op)

def _has_string_value_column(self) -> bool:
    """Report whether the grouped object holds a pandas ``StringDtype`` value.

    For a ``Series`` the series' own dtype is inspected. Otherwise every
    column named in ``self.grouping._values_column_names`` is looked up in
    ``self.obj._data`` and the result is True as soon as one of them has a
    ``pd.StringDtype``.
    """
    # Imported locally, as the surrounding code does for Series/DataFrame.
    from cudf.core.series import Series

    grouped = self.obj
    if isinstance(grouped, Series):
        return isinstance(grouped.dtype, pd.StringDtype)
    return any(
        isinstance(grouped._data[name].dtype, pd.StringDtype)
        for name in self.grouping._values_column_names
    )

def _string_sum(self, *, skipna: bool, min_count: int):
    """Implement groupby sum for StringDtype columns as per-group
    string concatenation.

    Each string value column is grouped into a ``list<string>`` column
    with ``collect_list`` and the list elements of every group are then
    joined with an empty separator.

    Parameters
    ----------
    skipna : bool
        If True, null entries are replaced by the empty string before
        joining. If False, a null string replacement is passed to
        ``join_list_elements`` so that nulls are not silently dropped.
    min_count : int
        Minimum per-group ``count`` required to keep the concatenated
        value; groups below the threshold become null. A falsy value
        (``0`` or ``None``) or a non-positive value disables the check,
        mirroring the ``min_count and min_count > 0`` guard in
        ``_reduce``.

    Returns
    -------
    Series or DataFrame
        A ``Series`` when the grouped object is a ``Series``, otherwise
        a ``DataFrame``; both are indexed by the group keys.

    Raises
    ------
    NotImplementedError
        If a DataFrame has a non-string value column alongside string
        columns.
    """
    from cudf.core.column import ColumnBase
    from cudf.core.dataframe import DataFrame
    from cudf.core.series import Series

    # Treat None/0/negative uniformly as "no threshold" instead of
    # failing on ``None <= 0``.
    needs_min_count = bool(min_count) and min_count > 0

    empty_str = plc.Scalar.from_py("")
    null_str = plc.Scalar.from_py(None, plc.DataType(plc.TypeId.STRING))
    if skipna:
        string_narep = empty_str
        empty_policy = plc.strings.combine.OutputIfEmptyList.EMPTY_STRING
    else:
        string_narep = null_str
        empty_policy = plc.strings.combine.OutputIfEmptyList.NULL_ELEMENT

    def _sum_column(string_col):
        """Concatenate one string column per group; return (column, keys)."""
        # Request the count in the same groupby pass as collect_list so
        # both results are aligned and only one aggregation call is made.
        aggregations = [plc.aggregation.collect_list()]
        if needs_min_count:
            aggregations.append(plc.aggregation.count())
        requests = [
            plc.groupby.GroupByRequest(string_col.plc_column, aggregations)
        ]
        with access_columns(string_col, mode="read", scope="internal"):
            with self._groupby_manager as plc_groupby:
                keys, results = plc_groupby.aggregate(requests)
        agg_columns = results[0].columns()

        joined = plc.strings.combine.join_list_elements(
            agg_columns[0],
            empty_str,  # separator
            empty_str,  # separator_narep
            string_narep,
            plc.strings.combine.SeparatorOnNulls.YES,
            empty_policy,
        )
        result_col = ColumnBase.create(joined, string_col.dtype)

        if needs_min_count:
            count_plc = agg_columns[1]
            count_col = ColumnBase.create(
                count_plc, dtype_from_pylibcudf_column(count_plc)
            )
            keep_mask = binaryop.binaryop(
                count_col,
                plc.Scalar.from_py(min_count),
                "__ge__",
                np.dtype(np.bool_),
            )
            # Keep the joined value where count >= min_count, else null.
            result_col = result_col.copy_if_else(null_str, keep_mask)
        return result_col, keys

    def _index_from_keys(keys):
        """Rebuild the group-key index with the original key dtypes."""
        return self.grouping.keys._from_columns_like_self(
            [
                ColumnBase.create(key, key_col.dtype)
                for key, key_col in zip(
                    keys.columns(), self.grouping._key_columns, strict=True
                )
            ]
        )

    if isinstance(self.obj, Series):
        out_col, keys = _sum_column(self.obj._column)
        return Series._from_column(
            out_col, name=self.obj.name, index=_index_from_keys(keys)
        )

    value_columns = {
        col_name: self.obj._data[col_name]
        for col_name in self.grouping._values_column_names
    }
    # Validate up front so no aggregation work is done before rejecting
    # an unsupported frame.
    # TODO: handle mixed dtype frames
    if not all(
        isinstance(col.dtype, pd.StringDtype)
        for col in value_columns.values()
    ):
        raise NotImplementedError(
            "sum on mixed string and non-string columns is "
            "not yet supported"
        )

    out_data = {}
    shared_keys = None
    for col_name, col in value_columns.items():
        out_data[col_name], keys = _sum_column(col)
        # Key index is shared across all aggregations; keep the first.
        if shared_keys is None:
            shared_keys = keys
    # Invariant: at least one value column was aggregated.
    assert shared_keys is not None
    return DataFrame._from_data(
        out_data, index=_index_from_keys(shared_keys)
    )

aggregate = agg

def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
Expand Down
16 changes: 0 additions & 16 deletions python/cudf/cudf/pandas/scripts/conftest-patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -3605,10 +3605,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-True-0]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-nunique-True-True-1]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-all-True-False-0]": "AssertionError: DataFrame are different",
Expand Down Expand Up @@ -3653,10 +3649,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-True-0]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-nunique-True-True-1]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=str[python]-sum-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-all-True-False-0]": "AssertionError: DataFrame are different",
Expand Down Expand Up @@ -3709,10 +3701,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-False-True-1]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-True-True-0]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-size-True-True-1]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[pyarrow]-sum-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-all-True-False-0]": "AssertionError: DataFrame are different",
Expand Down Expand Up @@ -3761,10 +3749,6 @@ def pytest_unconfigure(config):
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-True-0]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-nunique-True-True-1]": "AssertionError: Attributes of Series are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-False-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-False-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-True-False-0]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_string_dtype_all_na[string=string[python]-sum-True-False-1]": "AssertionError: DataFrame are different",
"tests/groupby/test_reductions.py::test_sum_skipna[False-values0-float64]": "AssertionError: Series are different",
"tests/groupby/test_reductions.py::test_sum_skipna[False-values3-timedelta64[ns]]": "AssertionError: Series are different",
"tests/groupby/test_reductions.py::test_sum_skipna_object[False]": "AssertionError: Series are different",
Expand Down
33 changes: 31 additions & 2 deletions python/cudf/cudf/tests/groupby/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -911,8 +911,14 @@ def test_group_by_empty_reduction(
)
request.applymarker(
pytest.mark.xfail(
condition=all_supported_types_as_str in {"str", "category"}
and groupby_reduction_methods in {"sum", "prod", "mean"},
condition=(
all_supported_types_as_str in {"str", "category"}
and groupby_reduction_methods in {"prod", "mean"}
)
or (
all_supported_types_as_str == "category"
and groupby_reduction_methods == "sum"
),
raises=TypeError,
reason=f"{all_supported_types_as_str} raises TypeError with {groupby_reduction_methods}",
)
Expand Down Expand Up @@ -1191,6 +1197,29 @@ def test_string_groupby_key_index():
assert_eq(expect, got, check_dtype=False)


@pytest.mark.parametrize(
    "string_dtype",
    [
        pd.StringDtype(storage=storage, na_value=na_value)
        for storage in ("python", "pyarrow")
        for na_value in (pd.NA, np.nan)
    ],
)
def test_groupby_string_sum(string_dtype):
    """Groupby ``sum`` on a StringDtype column matches pandas.

    Covers both storage backends and both NA sentinels; one group
    contains an empty string.
    """
    values = pd.array(["x", "y", "", "z", "q"], dtype=string_dtype)
    pdf = pd.DataFrame({"a": [1, 1, 2, 2, 3], "b": values})
    gdf = cudf.from_pandas(pdf)
    # NOTE(review): indentation was lost in the reviewed source, so the
    # exact extent of this context block is assumed - TODO confirm.
    with cudf.option_context("mode.pandas_compatible", True):
        got = gdf.groupby("a").sum()
        expect = pdf.groupby("a").sum()
        assert_eq(expect, got)


@pytest.mark.parametrize("op", ["sum", "min", "max", "first", "last"])
@pytest.mark.parametrize("min_count", [0, 1, 2, 3, 5])
def test_groupby_reduce_min_count(op, min_count):
Expand Down
Loading