Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion python/cudf/cudf/core/accessors/base_accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,14 @@ def _return_or_inplace( # type: ignore[misc]
index=self._parent.index, # type: ignore[union-attr]
attrs=self._parent.attrs, # type: ignore[union-attr]
)
if len(table) == 0:
keys = (
tuple(table.keys()) if hasattr(table, "keys") else ()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
tuple(table.keys()) if hasattr(table, "keys") else ()
tuple(table.keys()) if isinstance(table, dict) else ()

Could we use this stricter check?

)
if len(table) == 0 or (
keys
and all(isinstance(k, int) for k in keys)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe this check is redundant with the one below. The equality comparison below should be false if keys did not contain ints

and tuple(keys) == tuple(range(len(keys)))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
and tuple(keys) == tuple(range(len(keys)))
and keys == tuple(range(len(keys)))

(Since the keys assignment above creates it as a tuple already)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, this assumes that the columns are always 0..n-1 (e.g. the RangeIndex could instead start at 1, i.e. 1..n), but that can be tackled in a follow-up.

):
df._data.rangeindex = True
return df
elif isinstance(self._parent, cudf.Series):
Expand Down
46 changes: 44 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2126,6 +2126,15 @@ def _concat(
# Reassign index and column names
if objs[0]._data.multiindex:
out._set_columns_like(objs[0]._data)
elif (
all(obj._data.rangeindex for obj in objs)
and all(
tuple(obj._column_names) == tuple(range(obj._num_columns))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the first `all` check confirms that each obj has RangeIndex columns, and we're only going to compare against 0..n-1 (which, as mentioned above, might be limiting), we could make this check quicker by just checking `obj._column_names[0] == 0 and obj._column_names[-1] == obj._num_columns - 1` instead of comparing all the materialized range values.

for obj in objs
)
and tuple(names) == tuple(range(len(names)))
):
out.columns = cudf.RangeIndex(len(names))
else:
out.columns = names
if not ignore_index:
Expand Down Expand Up @@ -2419,6 +2428,9 @@ def _fill_same_ca_attributes(
else:
raise ValueError("other must be a DataFrame or Series.")

if isinstance(column_names_list, pd.MultiIndex):
ca_attributes["multiindex"] = True
ca_attributes["level_names"] = tuple(column_names_list.names)
sorted_dict = {key: operands[key] for key in column_names_list}
return sorted_dict, index, ca_attributes
return operands, index, ca_attributes
Expand Down Expand Up @@ -4805,6 +4817,23 @@ def join(
df.index.name = (
None if self.index.name != other.index.name else self.index.name
)

# Preserve a CategoricalIndex columns axis when both inputs share the
# same categorical dtype on their column labels (matches pandas).
self_pd_cols = self._data.to_pandas_index
other_pd_cols = other._data.to_pandas_index
if (
isinstance(self_pd_cols, pd.CategoricalIndex)
and isinstance(other_pd_cols, pd.CategoricalIndex)
and self_pd_cols.dtype == other_pd_cols.dtype
):
df.columns = pd.CategoricalIndex(
list(self_pd_cols) + list(other_pd_cols),
dtype=self_pd_cols.dtype,
name=self_pd_cols.name
if self_pd_cols.name == other_pd_cols.name
else None,
)
Comment on lines +4830 to +4836
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
df.columns = pd.CategoricalIndex(
list(self_pd_cols) + list(other_pd_cols),
dtype=self_pd_cols.dtype,
name=self_pd_cols.name
if self_pd_cols.name == other_pd_cols.name
else None,
)
df.columns = self_pd_cols.append(other_pd_cols)

Should give you the same result

return df

@_performance_tracking
Expand Down Expand Up @@ -6369,7 +6398,15 @@ def quantile(
if len(res) == 0:
res = column_empty(row_count=len(qs), dtype=ser.dtype)
result[k] = res
result = DataFrame._from_data(result, attrs=self.attrs)
result_ca = ColumnAccessor(
result,
multiindex=data_df._data.multiindex,
level_names=data_df._data.level_names,
rangeindex=data_df._data.rangeindex,
label_dtype=data_df._data.label_dtype,
verify=False,
)
result = DataFrame._from_data(result_ca, attrs=self.attrs)

if q_is_number and numeric_only:
result = result.fillna(np.nan).iloc[0]
Expand Down Expand Up @@ -7233,7 +7270,12 @@ def cudf_dtype_from_pydata_dtype(dtype):
for label, dtype in self._dtypes
if cudf_dtype_from_pydata_dtype(dtype) in inclusion
]
return self.loc[:, to_select]
result = self.loc[:, to_select]
if not to_select and self._data.rangeindex:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally, `loc` itself would preserve `.rangeindex`, but that could be left for another PR.

# Preserve RangeIndex columns through an empty selection so that
# downstream operations match pandas' column metadata.
result._data.rangeindex = True
return result

@ioutils.doc_to_parquet()
def to_parquet(
Expand Down
50 changes: 47 additions & 3 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1111,12 +1111,27 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
if cast_dtype is not None:
result_col = result_col.astype(cast_dtype)
data[key] = result_col
data = ColumnAccessor(data, multiindex=multilevel)
from cudf.core.dataframe import DataFrame

# Preserve the column axis label-dtype/level_names from the source
# DataFrame so that aggregations such as ``nunique`` keep the column
# axis name (matching pandas behavior).
if (
not multilevel
and isinstance(self.obj, DataFrame)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
and isinstance(self.obj, DataFrame)
and self.obj.ndim == 2

nit (to avoid the DataFrame runtime import)

and self.obj._data.level_names != (None,)
):
data = ColumnAccessor(
data,
multiindex=False,
level_names=self.obj._data.level_names,
label_dtype=self.obj._data.label_dtype,
)
else:
data = ColumnAccessor(data, multiindex=multilevel)
if not multilevel:
data = data.rename_levels({np.nan: None}, level=0)

from cudf.core.dataframe import DataFrame

result = DataFrame._from_data(data, index=result_index)

if self._sort:
Expand Down Expand Up @@ -2753,6 +2768,8 @@ def _scan_fill(
) -> DataFrameOrSeries:
"""Internal implementation for `ffill` and `bfill`"""
values = self.grouping.values
from cudf.core.dataframe import DataFrame

result = self.obj._from_data(
dict(
zip(
Expand All @@ -2762,6 +2779,33 @@ def _scan_fill(
)
)
)
# Pandas' groupby.ffill/bfill builds the result columns via a ``take``
# on the input columns, which converts integer-valued column labels
# to object dtype. Reproduce that here so column metadata matches.
if (
isinstance(result, DataFrame)
and isinstance(self.obj, DataFrame)
and result._num_columns < self.obj._num_columns
):
source_pd_cols = self.obj._data.to_pandas_index
if (
source_pd_cols.dtype.kind in {"i", "u"}
or source_pd_cols.dtype == object
):
try:
positions = [
source_pd_cols.get_loc(c) for c in result._column_names
]
Comment on lines +2796 to +2798
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might be able to do the same with

indexer = source_pd_cols.get_indexer(result._column_names)
if not (indexer == -1).any():
    taken = source_pd_cols.take(indexer)
    ...

except (KeyError, TypeError):
positions = None
if positions is not None:
taken = source_pd_cols.take(positions)
if (
not isinstance(taken, pd.MultiIndex)
and taken.dtype != object
):
taken = taken.astype(object)
result.columns = taken
return self._mimic_pandas_order(result)

def ffill(self, limit: int | None = None):
Expand Down
9 changes: 9 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4013,11 +4013,13 @@ def _reindex(

index = index if index is not None else df.index

label_dtype = None
if column_names is None:
names = list(df._column_names)
level_names = self._data.level_names
multiindex = self._data.multiindex
rangeindex = self._data.rangeindex
label_dtype = self._data.label_dtype
elif isinstance(column_names, (pd.Index, cudf.Index)):
if isinstance(column_names, (pd.MultiIndex, cudf.MultiIndex)):
multiindex = True
Expand All @@ -4034,6 +4036,12 @@ def _reindex(
rangeindex = isinstance(
column_names, (pd.RangeIndex, cudf.RangeIndex)
)
if not rangeindex:
label_dtype = (
column_names.dtype
if isinstance(column_names, pd.Index)
else column_names.to_pandas().dtype
)
level_names = tuple(column_names.names)
else:
names = column_names
Expand Down Expand Up @@ -4066,6 +4074,7 @@ def _reindex(
multiindex=multiindex,
level_names=level_names,
rangeindex=rangeindex,
label_dtype=label_dtype,
),
index=index,
attrs=self.attrs,
Expand Down
32 changes: 31 additions & 1 deletion python/cudf/cudf/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,10 @@ def _normalize_series_and_dataframe(
name = obj.name
if name is None:
if axis == 0:
name = 0
# Preserve "unnamed" semantics so the resulting frame has
# a RangeIndex columns object (matching pandas).
objs[idx] = obj.to_frame()
continue
else:
name = sr_name
sr_name += 1
Expand Down Expand Up @@ -1063,12 +1066,39 @@ def pivot(
index_data = index_data.get_level_values(0)
else:
index_data = cudf.Index(index_data)
# An entirely empty input pivots to an empty result. Pandas uses the
# default ``object`` dtype for the resulting index axis in that case;
# mirror this so index metadata (dtype/inferred_type) matches.
if (
len(data) == 0
and not isinstance(index_data, cudf.MultiIndex)
and isinstance(index_data.dtype, pd.StringDtype)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this apply to any of the string types, and not just pd.StringDtype?

):
index_data = cudf.Index(
pd.Index([], name=index_data.name, dtype=object)
)

column_data = data.loc[:, columns]
# When `columns` is a scalar but the source DataFrame has a MultiIndex on
# the row axis, ``loc`` may return a 2-D selection in cuDF. Treat the
# selection as 1-D so we end up with a flat Index of column labels.
if is_scalar(columns) and column_data.ndim == 2:
column_data = column_data.iloc[:, 0]
if column_data.ndim == 2:
column_data = cudf.MultiIndex.from_frame(column_data)
else:
column_data = cudf.Index(column_data)
# An entirely empty input pivots to an empty result. Pandas reports the
# default ``object`` dtype for the resulting columns axis in that case;
# mirror this so column metadata (dtype/inferred_type) matches.
if (
len(data) == 0
and not isinstance(column_data, cudf.MultiIndex)
and isinstance(column_data.dtype, pd.StringDtype)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question here

):
column_data = cudf.Index(
pd.Index([], name=column_data.name, dtype=object)
)

# Create a DataFrame composed of columns from both
# columns and index
Expand Down
15 changes: 13 additions & 2 deletions python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,11 +254,22 @@ def tolist(self) -> None:
def _to_frame(self, name: Hashable, index: Index | None) -> DataFrame:
"""Helper function for Series.to_frame, Index.to_frame"""

unnamed_default = False
col_name: Hashable
if name is no_default:
col_name = 0 if self.name is None else self.name
if self.name is None:
col_name = 0
unnamed_default = True
else:
col_name = self.name
else:
col_name = name
ca = ColumnAccessor({col_name: self._column}, verify=False)
ca = ColumnAccessor(
{col_name: self._column},
multiindex=isinstance(col_name, tuple),
rangeindex=unnamed_default,
verify=False,
)
# TODO: Avoid accessing DataFrame from the top level namespace
return cudf.DataFrame._from_data(ca, index=index)

Expand Down
Loading
Loading