-
Notifications
You must be signed in to change notification settings - Fork 1.1k
Fix AssertionError: DataFrame.columns are different failures in cudf.pandas
#22351
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: pandas3
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -98,7 +98,14 @@ def _return_or_inplace( # type: ignore[misc] | |||||
| index=self._parent.index, # type: ignore[union-attr] | ||||||
| attrs=self._parent.attrs, # type: ignore[union-attr] | ||||||
| ) | ||||||
| if len(table) == 0: | ||||||
| keys = ( | ||||||
| tuple(table.keys()) if hasattr(table, "keys") else () | ||||||
| ) | ||||||
| if len(table) == 0 or ( | ||||||
| keys | ||||||
| and all(isinstance(k, int) for k in keys) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe this check is redundant with the one below. The equality comparison below should be false if keys did not contain ints |
||||||
| and tuple(keys) == tuple(range(len(keys))) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
(Since the keys assignment above creates it as a tuple already)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also this does have the assumption that the columns are always |
||||||
| ): | ||||||
| df._data.rangeindex = True | ||||||
| return df | ||||||
| elif isinstance(self._parent, cudf.Series): | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -2126,6 +2126,15 @@ def _concat( | |||||||||||||||||
| # Reassign index and column names | ||||||||||||||||||
| if objs[0]._data.multiindex: | ||||||||||||||||||
| out._set_columns_like(objs[0]._data) | ||||||||||||||||||
| elif ( | ||||||||||||||||||
| all(obj._data.rangeindex for obj in objs) | ||||||||||||||||||
| and all( | ||||||||||||||||||
| tuple(obj._column_names) == tuple(range(obj._num_columns)) | ||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If the first |
||||||||||||||||||
| for obj in objs | ||||||||||||||||||
| ) | ||||||||||||||||||
| and tuple(names) == tuple(range(len(names))) | ||||||||||||||||||
| ): | ||||||||||||||||||
| out.columns = cudf.RangeIndex(len(names)) | ||||||||||||||||||
| else: | ||||||||||||||||||
| out.columns = names | ||||||||||||||||||
| if not ignore_index: | ||||||||||||||||||
|
|
@@ -2419,6 +2428,9 @@ def _fill_same_ca_attributes( | |||||||||||||||||
| else: | ||||||||||||||||||
| raise ValueError("other must be a DataFrame or Series.") | ||||||||||||||||||
|
|
||||||||||||||||||
| if isinstance(column_names_list, pd.MultiIndex): | ||||||||||||||||||
| ca_attributes["multiindex"] = True | ||||||||||||||||||
| ca_attributes["level_names"] = tuple(column_names_list.names) | ||||||||||||||||||
| sorted_dict = {key: operands[key] for key in column_names_list} | ||||||||||||||||||
| return sorted_dict, index, ca_attributes | ||||||||||||||||||
| return operands, index, ca_attributes | ||||||||||||||||||
|
|
@@ -4805,6 +4817,23 @@ def join( | |||||||||||||||||
| df.index.name = ( | ||||||||||||||||||
| None if self.index.name != other.index.name else self.index.name | ||||||||||||||||||
| ) | ||||||||||||||||||
|
|
||||||||||||||||||
| # Preserve a CategoricalIndex columns axis when both inputs share the | ||||||||||||||||||
| # same categorical dtype on their column labels (matches pandas). | ||||||||||||||||||
| self_pd_cols = self._data.to_pandas_index | ||||||||||||||||||
| other_pd_cols = other._data.to_pandas_index | ||||||||||||||||||
| if ( | ||||||||||||||||||
| isinstance(self_pd_cols, pd.CategoricalIndex) | ||||||||||||||||||
| and isinstance(other_pd_cols, pd.CategoricalIndex) | ||||||||||||||||||
| and self_pd_cols.dtype == other_pd_cols.dtype | ||||||||||||||||||
| ): | ||||||||||||||||||
| df.columns = pd.CategoricalIndex( | ||||||||||||||||||
| list(self_pd_cols) + list(other_pd_cols), | ||||||||||||||||||
| dtype=self_pd_cols.dtype, | ||||||||||||||||||
| name=self_pd_cols.name | ||||||||||||||||||
| if self_pd_cols.name == other_pd_cols.name | ||||||||||||||||||
| else None, | ||||||||||||||||||
| ) | ||||||||||||||||||
|
Comment on lines
+4830
to
+4836
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Should give you the same result |
||||||||||||||||||
| return df | ||||||||||||||||||
|
|
||||||||||||||||||
| @_performance_tracking | ||||||||||||||||||
|
|
@@ -6369,7 +6398,15 @@ def quantile( | |||||||||||||||||
| if len(res) == 0: | ||||||||||||||||||
| res = column_empty(row_count=len(qs), dtype=ser.dtype) | ||||||||||||||||||
| result[k] = res | ||||||||||||||||||
| result = DataFrame._from_data(result, attrs=self.attrs) | ||||||||||||||||||
| result_ca = ColumnAccessor( | ||||||||||||||||||
| result, | ||||||||||||||||||
| multiindex=data_df._data.multiindex, | ||||||||||||||||||
| level_names=data_df._data.level_names, | ||||||||||||||||||
| rangeindex=data_df._data.rangeindex, | ||||||||||||||||||
| label_dtype=data_df._data.label_dtype, | ||||||||||||||||||
| verify=False, | ||||||||||||||||||
| ) | ||||||||||||||||||
| result = DataFrame._from_data(result_ca, attrs=self.attrs) | ||||||||||||||||||
|
|
||||||||||||||||||
| if q_is_number and numeric_only: | ||||||||||||||||||
| result = result.fillna(np.nan).iloc[0] | ||||||||||||||||||
|
|
@@ -7233,7 +7270,12 @@ def cudf_dtype_from_pydata_dtype(dtype): | |||||||||||||||||
| for label, dtype in self._dtypes | ||||||||||||||||||
| if cudf_dtype_from_pydata_dtype(dtype) in inclusion | ||||||||||||||||||
| ] | ||||||||||||||||||
| return self.loc[:, to_select] | ||||||||||||||||||
| result = self.loc[:, to_select] | ||||||||||||||||||
| if not to_select and self._data.rangeindex: | ||||||||||||||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ideally I would hope |
||||||||||||||||||
| # Preserve RangeIndex columns through an empty selection so that | ||||||||||||||||||
| # downstream operations match pandas' column metadata. | ||||||||||||||||||
| result._data.rangeindex = True | ||||||||||||||||||
| return result | ||||||||||||||||||
|
|
||||||||||||||||||
| @ioutils.doc_to_parquet() | ||||||||||||||||||
| def to_parquet( | ||||||||||||||||||
|
|
||||||||||||||||||
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -1111,12 +1111,27 @@ def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs): | |||||
| if cast_dtype is not None: | ||||||
| result_col = result_col.astype(cast_dtype) | ||||||
| data[key] = result_col | ||||||
| data = ColumnAccessor(data, multiindex=multilevel) | ||||||
| from cudf.core.dataframe import DataFrame | ||||||
|
|
||||||
| # Preserve the column axis label-dtype/level_names from the source | ||||||
| # DataFrame so that aggregations such as ``nunique`` keep the column | ||||||
| # axis name (matching pandas behavior). | ||||||
| if ( | ||||||
| not multilevel | ||||||
| and isinstance(self.obj, DataFrame) | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
nit (to avoid the DataFrame runtime import) |
||||||
| and self.obj._data.level_names != (None,) | ||||||
| ): | ||||||
| data = ColumnAccessor( | ||||||
| data, | ||||||
| multiindex=False, | ||||||
| level_names=self.obj._data.level_names, | ||||||
| label_dtype=self.obj._data.label_dtype, | ||||||
| ) | ||||||
| else: | ||||||
| data = ColumnAccessor(data, multiindex=multilevel) | ||||||
| if not multilevel: | ||||||
| data = data.rename_levels({np.nan: None}, level=0) | ||||||
|
|
||||||
| from cudf.core.dataframe import DataFrame | ||||||
|
|
||||||
| result = DataFrame._from_data(data, index=result_index) | ||||||
|
|
||||||
| if self._sort: | ||||||
|
|
@@ -2753,6 +2768,8 @@ def _scan_fill( | |||||
| ) -> DataFrameOrSeries: | ||||||
| """Internal implementation for `ffill` and `bfill`""" | ||||||
| values = self.grouping.values | ||||||
| from cudf.core.dataframe import DataFrame | ||||||
|
|
||||||
| result = self.obj._from_data( | ||||||
| dict( | ||||||
| zip( | ||||||
|
|
@@ -2762,6 +2779,33 @@ def _scan_fill( | |||||
| ) | ||||||
| ) | ||||||
| ) | ||||||
| # Pandas' groupby.ffill/bfill builds the result columns via a ``take`` | ||||||
| # on the input columns, which converts integer-valued column labels | ||||||
| # to object dtype. Reproduce that here so column metadata matches. | ||||||
| if ( | ||||||
| isinstance(result, DataFrame) | ||||||
| and isinstance(self.obj, DataFrame) | ||||||
| and result._num_columns < self.obj._num_columns | ||||||
| ): | ||||||
| source_pd_cols = self.obj._data.to_pandas_index | ||||||
| if ( | ||||||
| source_pd_cols.dtype.kind in {"i", "u"} | ||||||
| or source_pd_cols.dtype == object | ||||||
| ): | ||||||
| try: | ||||||
| positions = [ | ||||||
| source_pd_cols.get_loc(c) for c in result._column_names | ||||||
| ] | ||||||
|
Comment on lines
+2796
to
+2798
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You might be able to to the same with indexer = source_pd_cols.get_indexer(result._column_names)
if not (indexer == -1).any():
taken = source_pd_cols.take(positions)
... |
||||||
| except (KeyError, TypeError): | ||||||
| positions = None | ||||||
| if positions is not None: | ||||||
| taken = source_pd_cols.take(positions) | ||||||
| if ( | ||||||
| not isinstance(taken, pd.MultiIndex) | ||||||
| and taken.dtype != object | ||||||
| ): | ||||||
| taken = taken.astype(object) | ||||||
| result.columns = taken | ||||||
| return self._mimic_pandas_order(result) | ||||||
|
|
||||||
| def ffill(self, limit: int | None = None): | ||||||
|
|
||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -129,7 +129,10 @@ def _normalize_series_and_dataframe( | |
| name = obj.name | ||
| if name is None: | ||
| if axis == 0: | ||
| name = 0 | ||
| # Preserve "unnamed" semantics so the resulting frame has | ||
| # a RangeIndex columns object (matching pandas). | ||
| objs[idx] = obj.to_frame() | ||
| continue | ||
| else: | ||
| name = sr_name | ||
| sr_name += 1 | ||
|
|
@@ -1063,12 +1066,39 @@ def pivot( | |
| index_data = index_data.get_level_values(0) | ||
| else: | ||
| index_data = cudf.Index(index_data) | ||
| # An entirely empty input pivots to an empty result. Pandas uses the | ||
| # default ``object`` dtype for the resulting index axis in that case; | ||
| # mirror this so index metadata (dtype/inferred_type) matches. | ||
| if ( | ||
| len(data) == 0 | ||
| and not isinstance(index_data, cudf.MultiIndex) | ||
| and isinstance(index_data.dtype, pd.StringDtype) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this to apply to any of the string types and not just |
||
| ): | ||
| index_data = cudf.Index( | ||
| pd.Index([], name=index_data.name, dtype=object) | ||
| ) | ||
|
|
||
| column_data = data.loc[:, columns] | ||
| # When `columns` is a scalar but the source DataFrame has a MultiIndex on | ||
| # the row axis, ``loc`` may return a 2-D selection in cuDF. Treat the | ||
| # selection as 1-D so we end up with a flat Index of column labels. | ||
| if is_scalar(columns) and column_data.ndim == 2: | ||
| column_data = column_data.iloc[:, 0] | ||
| if column_data.ndim == 2: | ||
| column_data = cudf.MultiIndex.from_frame(column_data) | ||
| else: | ||
| column_data = cudf.Index(column_data) | ||
| # An entirely empty input pivots to an empty result. Pandas reports the | ||
| # default ``object`` dtype for the resulting columns axis in that case; | ||
| # mirror this so column metadata (dtype/inferred_type) matches. | ||
| if ( | ||
| len(data) == 0 | ||
| and not isinstance(column_data, cudf.MultiIndex) | ||
| and isinstance(column_data.dtype, pd.StringDtype) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same question here |
||
| ): | ||
| column_data = cudf.Index( | ||
| pd.Index([], name=column_data.name, dtype=object) | ||
| ) | ||
|
|
||
| # Create a DataFrame composed of columns from both | ||
| # columns and index | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could we use this stricter check?