diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index aa1580556c0..4dd00968c59 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -85,11 +85,11 @@ dtype_to_pylibcudf_type, find_common_type, get_dtype_of_same_kind, + is_arrow_null_dtype, is_column_like, is_mixed_with_object_dtype, is_pandas_nullable_extension_dtype, is_pandas_nullable_numpy_dtype, - maybe_normalize_arrow_null, min_signed_type, np_dtypes_to_pandas_dtypes, pyarrow_dtype_to_cudf_dtype, @@ -378,13 +378,9 @@ def _wrap_and_validate(col: plc.Column, dtype: DtypeObj) -> plc.Column: "Normalize to np.dtype('O') before calling " "ColumnBase.create." ) - if isinstance(dtype, pd.ArrowDtype) and pa.types.is_null( - dtype.pyarrow_dtype - ): + if is_arrow_null_dtype(dtype) and col.null_count() != col.size(): raise ValueError( - f"dtype {dtype} is a pandas nullable string dtype with all nulls. " - "Normalize to an empty string column with the same pandas StringDtype " - "before calling ColumnBase.create." + f"dtype {dtype} can only be used with all-null columns." ) dtype_kind = dtype.kind @@ -961,15 +957,11 @@ def create( like copy-on-write. When validation is disabled, the caller is responsible for ensuring that col and its children are already normalized and wrapped. """ - # For pandas nullable null types (ArrowDtype wrapping pa.null()), - # normalize the column data and dtype before construction. - col, dtype, old_dtype = maybe_normalize_arrow_null(col, dtype) - # Dispatch to the appropriate subclass based on dtype target_cls = ColumnBase._dispatch_subclass_from_dtype(dtype) self = target_cls.__new__(target_cls) self.plc_column = _wrap_and_validate(col, dtype) if validate else col - self._dtype = dtype if old_dtype is None else old_dtype + self._dtype = dtype self._distinct_count = {} self._has_nulls = {} # The set of exposed buffers associated with this column. These buffers must be @@ -1419,6 +1411,8 @@ def dropna(self) -> Self: return self.copy() def to_arrow(self) -> pa.Array: + if is_arrow_null_dtype(self.dtype): + return pa.nulls(len(self)) with self.access(mode="read", scope="internal"): return _handle_nulls( self.plc_column.to_arrow( @@ -3323,6 +3317,12 @@ def as_column( elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): if isinstance(arbitrary, pa.NullArray) and dtype is None: dtype = np.dtype("object") + elif is_arrow_null_dtype(dtype): + if arbitrary.null_count != len(arbitrary): + raise ValueError( + f"dtype {dtype} can only be used with all-null data." + ) + arbitrary = pa.nulls(len(arbitrary)) column = ColumnBase.from_arrow(arbitrary) if nan_as_null is not False: column = column.nans_to_nulls() @@ -3536,6 +3536,11 @@ def as_column( elif length < 0: raise ValueError(f"{length=} must be >=0.") + if is_arrow_null_dtype(dtype): + if is_na_like(arbitrary): + return column_empty(length, dtype=dtype) + pa.scalar(arbitrary, type=dtype.pyarrow_dtype) + pa_type = None if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype): return as_column( @@ -3768,6 +3773,13 @@ def as_column( from_pandas = nan_as_null is None or nan_as_null if dtype is not None: + if is_arrow_null_dtype(dtype): + arbitrary = pa.array( + arbitrary, + type=dtype.pyarrow_dtype, + from_pandas=True, + ) + return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) try: arbitrary = pa.array( arbitrary, diff --git a/python/cudf/cudf/core/dtype/validators.py b/python/cudf/cudf/core/dtype/validators.py index d5470131c61..44cde19d777 100644 --- a/python/cudf/cudf/core/dtype/validators.py +++ b/python/cudf/cudf/core/dtype/validators.py @@ -35,6 +35,7 @@ def is_dtype_obj_string(obj: DtypeObj) -> bool: and ( pa.types.is_string(obj.pyarrow_dtype) or pa.types.is_large_string(obj.pyarrow_dtype) + or pa.types.is_null(obj.pyarrow_dtype) ) ) ) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 57ef45930dc..4e2d7f5747e 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -6574,12 +6574,16 @@ def convert_dtypes( if dtype_backend == "pyarrow": cols = [] for col in self._columns: - arrow_dtype = pd.ArrowDtype( - pa.null() - if col.null_count == len(col) - else cudf_dtype_to_pa_type(col.dtype) - ) - cols.append(ColumnBase.create(col.plc_column, arrow_dtype)) + if len(col) == 0 and is_dtype_obj_string(col.dtype): + cols.append(col) + continue + if len(col) != 0 and col.null_count == len(col): + cols.append(as_column(col, dtype=pd.ArrowDtype(pa.null()))) + else: + arrow_dtype = pd.ArrowDtype( + cudf_dtype_to_pa_type(col.dtype) + ) + cols.append(ColumnBase.create(col.plc_column, arrow_dtype)) return self._from_data_like_self( self._data._from_columns_like_self(cols, verify=False) ) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 47f0e4b25a2..c06f562ebbc 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -723,7 +723,6 @@ def pytest_unconfigure(config): "tests/base/test_constructors.py::TestConstruction::test_constructor_datetime_outofbound[Series-object-string]": "Failed: DID NOT RAISE ", "tests/base/test_conversion.py::test_array[index-arr3-_left]": "TODO: Add a reason for failure", "tests/base/test_conversion.py::test_array[index-arr4-_sparse_values]": "TODO: Add a reason for failure", - "tests/base/test_conversion.py::test_array[series-arr3-_left]": "TODO: Add a reason for failure", "tests/base/test_conversion.py::test_array[series-arr4-_sparse_values]": "TODO: Add a reason for failure", "tests/base/test_conversion.py::test_to_numpy[array-False-arr1-expected1-False]": "AssertionError: numpy array are different", "tests/base/test_conversion.py::test_to_numpy[array-True-arr1-expected1-False]": "AssertionError: numpy array are different", @@ -1640,28 +1639,6 @@ def pytest_unconfigure(config): "tests/extension/test_interval.py::TestIntervalArray::test_grouping_grouper": "AssertionError: ndarray Expected type , found instead", "tests/extension/test_interval.py::TestIntervalArray::test_in_numeric_groupby": "TODO: Add a reason for failure", "tests/extension/test_interval.py::TestIntervalArray::test_is_extension_array_dtype": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[BooleanDtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[BooleanDtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float32Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float32Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float64Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float64Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int16Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int16Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int32Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int32Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int64Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int64Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int8Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int8Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt16Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt16Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt32Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt32Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt64Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt64Dtype-python]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt8Dtype-c]": "TODO: Add a reason for failure", - "tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt8Dtype-python]": "TODO: Add a reason for failure", "tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumprod-False]": "TODO: Add a reason for failure", "tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumprod-True]": "TODO: Add a reason for failure", "tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumsum-False]": "TODO: Add a reason for failure", @@ -1908,10 +1885,6 @@ def pytest_unconfigure(config): "tests/extension/test_numpy.py::TestNumpyExtensionArray::test_value_counts_with_normalize[object]": "TODO: Add a reason for failure", "tests/extension/test_period.py::Test2DCompat::test_copy_order[2D]": "TODO: Add a reason for failure", "tests/extension/test_period.py::Test2DCompat::test_copy_order[D]": "TODO: Add a reason for failure", - "tests/extension/test_period.py::TestPeriodArray::test_EA_types[2D-c]": "TODO: Add a reason for failure", - "tests/extension/test_period.py::TestPeriodArray::test_EA_types[2D-python]": "TODO: Add a reason for failure", - "tests/extension/test_period.py::TestPeriodArray::test_EA_types[D-c]": "TODO: Add a reason for failure", - "tests/extension/test_period.py::TestPeriodArray::test_EA_types[D-python]": "TODO: Add a reason for failure", "tests/extension/test_period.py::TestPeriodArray::test_astype_own_type[2D-False]": "TODO: Add a reason for failure", "tests/extension/test_period.py::TestPeriodArray::test_astype_own_type[D-False]": "TODO: Add a reason for failure", "tests/extension/test_period.py::TestPeriodArray::test_astype_str[2D]": "AssertionError: Attributes of Series are different", @@ -1940,22 +1913,6 @@ def pytest_unconfigure(config): "tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-absolute]": "TODO: Add a reason for failure", "tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-negative]": "TODO: Add a reason for failure", "tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-positive]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-False-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-False-python]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-True-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-True-python]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-False-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-False-python]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-True-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-True-python]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-False-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-False-python]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-True-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-True-python]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-False-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-False-python]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-True-c]": "TODO: Add a reason for failure", - "tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-True-python]": "TODO: Add a reason for failure", "tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=str[python]-False-__add__]": "AssertionError: Attributes of Series are different", "tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=str[python]-True-__add__]": "AssertionError: Attributes of Series are different", "tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=string[pyarrow]-False-__radd__]": "TODO: Add a reason for failure", @@ -2190,7 +2147,6 @@ def pytest_unconfigure(config): "tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_avoid_block_splitting": "TODO: Add a reason for failure", "tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure", "tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_backend_no_conversion": "TODO: Add a reason for failure", - "tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_dtype_empty_object": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='0') are different", "tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_engine_lines_false": "TODO: Add a reason for failure", "tests/frame/methods/test_copy.py::TestCopy::test_copy_consolidates": "TODO: Add a reason for failure", "tests/frame/methods/test_count.py::TestDataFrameCount::test_count": "TODO: Add a reason for failure", @@ -4256,7 +4212,6 @@ def pytest_unconfigure(config): "tests/indexes/interval/test_constructors.py::TestFromBreaks::test_constructor_pass_closed[breaks3]": "TODO: Add a reason for failure", "tests/indexes/interval/test_constructors.py::TestFromBreaks::test_constructor_pass_closed[breaks4]": "TODO: Add a reason for failure", "tests/indexes/interval/test_constructors.py::TestFromBreaks::test_generic_errors": "TODO: Add a reason for failure", - "tests/indexes/interval/test_constructors.py::TestFromBreaks::test_left_right_dont_share_data": "TODO: Add a reason for failure", "tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks0]": "TODO: Add a reason for failure", "tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks2]": "TODO: Add a reason for failure", "tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks3]": "TODO: Add a reason for failure", @@ -6745,7 +6700,6 @@ def pytest_unconfigure(config): "tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data11-maindtype11-Int8-expected_other11]": "AssertionError: Attributes of Series are different", "tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data2-maindtype2-expected_default2-expected_other2]": "AssertionError: Attributes of Series are different", "tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data6-maindtype6-Int64-expected_other6]": "AssertionError: Attributes of Series are different", - "tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_null": "AssertionError: Attributes of Series are different", "tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure", "tests/series/methods/test_diff.py::TestSeriesDiff::test_diff_bool": "AssertionError: Attributes of Series are different", "tests/series/methods/test_drop.py::test_drop_exception_raised[drop_labels1-0-KeyError-not found in axis]": "Failed: DID NOT RAISE ", diff --git a/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py b/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py index fa001ca18d4..df4f217c277 100644 --- a/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py +++ b/python/cudf/cudf/tests/dataframe/methods/test_convert_dtypes.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 import pandas as pd import pytest @@ -45,3 +45,13 @@ def test_convert_dtypes(): with pytest.raises(NotImplementedError): # category and datetime64[ns] are not nullable gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True) + + +def test_convert_dtypes_pyarrow_null(): + pytest.importorskip("pyarrow") + data = {"a": [None, None]} + + expected = pd.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") + result = cudf.DataFrame(data).convert_dtypes(dtype_backend="pyarrow") + + assert_eq(result.to_pandas(), expected) diff --git a/python/cudf/cudf/tests/series/methods/test_convert_dtypes.py b/python/cudf/cudf/tests/series/methods/test_convert_dtypes.py index bcca9a3af57..d55a8a56c16 100644 --- a/python/cudf/cudf/tests/series/methods/test_convert_dtypes.py +++ b/python/cudf/cudf/tests/series/methods/test_convert_dtypes.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 import pandas as pd import pytest @@ -43,3 +43,13 @@ def test_convert_integer_false_convert_floating_true(): .to_pandas(nullable=True) ) assert_eq(result, expected) + + +def test_convert_dtypes_pyarrow_null(): + pytest.importorskip("pyarrow") + data = [None, None] + + expected = pd.Series(data).convert_dtypes(dtype_backend="pyarrow") + result = cudf.Series(data).convert_dtypes(dtype_backend="pyarrow") + + assert_eq(result.to_pandas(), expected) diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ae2c1070b57..663ba45c74a 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -601,29 +601,6 @@ def is_arrow_null_dtype(dtype: DtypeObj) -> bool: ) -def maybe_normalize_arrow_null( - col: plc.Column, dtype: DtypeObj -) -> tuple[plc.Column, DtypeObj, DtypeObj | None]: - """Normalize ArrowDtype(pa.null()) columns for internal construction. - - For pandas nullable null types (ArrowDtype wrapping pa.null()), - the column data is normalized and the dtype is replaced with - ``np.dtype("object")`` for internal dispatch. The original dtype - is returned as ``old_dtype`` so it can be stored on the column. - - Returns - ------- - tuple of (col, dtype, old_dtype) - ``old_dtype`` is the original dtype if normalization occurred, - otherwise ``None``. - """ - from cudf.core.column.column import _normalize_types_column - - if is_arrow_null_dtype(dtype): - return _normalize_types_column(col), np.dtype("object"), dtype - return col, dtype, None - - SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: dict[np.dtype[Any], plc.types.TypeId] = { np.dtype("int8"): plc.types.TypeId.INT8, np.dtype("int16"): plc.types.TypeId.INT16,