Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 24 additions & 12 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,11 +85,11 @@
dtype_to_pylibcudf_type,
find_common_type,
get_dtype_of_same_kind,
is_arrow_null_dtype,
is_column_like,
is_mixed_with_object_dtype,
is_pandas_nullable_extension_dtype,
is_pandas_nullable_numpy_dtype,
maybe_normalize_arrow_null,
min_signed_type,
np_dtypes_to_pandas_dtypes,
pyarrow_dtype_to_cudf_dtype,
Expand Down Expand Up @@ -378,13 +378,9 @@ def _wrap_and_validate(col: plc.Column, dtype: DtypeObj) -> plc.Column:
"Normalize to np.dtype('O') before calling "
"ColumnBase.create."
)
if isinstance(dtype, pd.ArrowDtype) and pa.types.is_null(
dtype.pyarrow_dtype
):
if is_arrow_null_dtype(dtype) and col.null_count() != col.size():
raise ValueError(
f"dtype {dtype} is a pandas nullable string dtype with all nulls. "
"Normalize to an empty string column with the same pandas StringDtype "
"before calling ColumnBase.create."
f"dtype {dtype} can only be used with all-null columns."
)

dtype_kind = dtype.kind
Expand Down Expand Up @@ -961,15 +957,11 @@ def create(
like copy-on-write. When validation is disabled, the caller is responsible for
ensuring that col and its children are already normalized and wrapped.
"""
# For pandas nullable null types (ArrowDtype wrapping pa.null()),
# normalize the column data and dtype before construction.
col, dtype, old_dtype = maybe_normalize_arrow_null(col, dtype)

# Dispatch to the appropriate subclass based on dtype
target_cls = ColumnBase._dispatch_subclass_from_dtype(dtype)
self = target_cls.__new__(target_cls)
self.plc_column = _wrap_and_validate(col, dtype) if validate else col
self._dtype = dtype if old_dtype is None else old_dtype
self._dtype = dtype
self._distinct_count = {}
self._has_nulls = {}
# The set of exposed buffers associated with this column. These buffers must be
Expand Down Expand Up @@ -1419,6 +1411,8 @@ def dropna(self) -> Self:
return self.copy()

def to_arrow(self) -> pa.Array:
if is_arrow_null_dtype(self.dtype):
return pa.nulls(len(self))
with self.access(mode="read", scope="internal"):
return _handle_nulls(
self.plc_column.to_arrow(
Expand Down Expand Up @@ -3323,6 +3317,12 @@ def as_column(
elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
if isinstance(arbitrary, pa.NullArray) and dtype is None:
dtype = np.dtype("object")
elif is_arrow_null_dtype(dtype):
if arbitrary.null_count != len(arbitrary):
raise ValueError(
f"dtype {dtype} can only be used with all-null data."
)
arbitrary = pa.nulls(len(arbitrary))
column = ColumnBase.from_arrow(arbitrary)
if nan_as_null is not False:
column = column.nans_to_nulls()
Expand Down Expand Up @@ -3536,6 +3536,11 @@ def as_column(
elif length < 0:
raise ValueError(f"{length=} must be >=0.")

if is_arrow_null_dtype(dtype):
if is_na_like(arbitrary):
return column_empty(length, dtype=dtype)
pa.scalar(arbitrary, type=dtype.pyarrow_dtype)

pa_type = None
if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
return as_column(
Expand Down Expand Up @@ -3768,6 +3773,13 @@ def as_column(

from_pandas = nan_as_null is None or nan_as_null
if dtype is not None:
if is_arrow_null_dtype(dtype):
arbitrary = pa.array(
arbitrary,
type=dtype.pyarrow_dtype,
from_pandas=True,
)
return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
try:
arbitrary = pa.array(
arbitrary,
Expand Down
1 change: 1 addition & 0 deletions python/cudf/cudf/core/dtype/validators.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def is_dtype_obj_string(obj: DtypeObj) -> bool:
and (
pa.types.is_string(obj.pyarrow_dtype)
or pa.types.is_large_string(obj.pyarrow_dtype)
or pa.types.is_null(obj.pyarrow_dtype)
)
)
)
Expand Down
16 changes: 10 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -6574,12 +6574,16 @@ def convert_dtypes(
if dtype_backend == "pyarrow":
cols = []
for col in self._columns:
arrow_dtype = pd.ArrowDtype(
pa.null()
if col.null_count == len(col)
else cudf_dtype_to_pa_type(col.dtype)
)
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
if len(col) == 0 and is_dtype_obj_string(col.dtype):
cols.append(col)
continue
if len(col) != 0 and col.null_count == len(col):
cols.append(as_column(col, dtype=pd.ArrowDtype(pa.null())))
else:
arrow_dtype = pd.ArrowDtype(
cudf_dtype_to_pa_type(col.dtype)
)
cols.append(ColumnBase.create(col.plc_column, arrow_dtype))
return self._from_data_like_self(
self._data._from_columns_like_self(cols, verify=False)
)
Expand Down
46 changes: 0 additions & 46 deletions python/cudf/cudf/pandas/scripts/conftest-patch.py
Original file line number Diff line number Diff line change
Expand Up @@ -723,7 +723,6 @@ def pytest_unconfigure(config):
"tests/base/test_constructors.py::TestConstruction::test_constructor_datetime_outofbound[Series-object-string]": "Failed: DID NOT RAISE <class 'pandas.errors.OutOfBoundsDatetime'>",
"tests/base/test_conversion.py::test_array[index-arr3-_left]": "TODO: Add a reason for failure",
"tests/base/test_conversion.py::test_array[index-arr4-_sparse_values]": "TODO: Add a reason for failure",
"tests/base/test_conversion.py::test_array[series-arr3-_left]": "TODO: Add a reason for failure",
"tests/base/test_conversion.py::test_array[series-arr4-_sparse_values]": "TODO: Add a reason for failure",
"tests/base/test_conversion.py::test_to_numpy[array-False-arr1-expected1-False]": "AssertionError: numpy array are different",
"tests/base/test_conversion.py::test_to_numpy[array-True-arr1-expected1-False]": "AssertionError: numpy array are different",
Expand Down Expand Up @@ -1640,28 +1639,6 @@ def pytest_unconfigure(config):
"tests/extension/test_interval.py::TestIntervalArray::test_grouping_grouper": "AssertionError: ndarray Expected type <class 'numpy.ndarray'>, found <class 'pandas.arrays.ArrowStringArray'> instead",
"tests/extension/test_interval.py::TestIntervalArray::test_in_numeric_groupby": "TODO: Add a reason for failure",
"tests/extension/test_interval.py::TestIntervalArray::test_is_extension_array_dtype": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[BooleanDtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[BooleanDtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float32Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float32Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float64Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Float64Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int16Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int16Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int32Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int32Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int64Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int64Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int8Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[Int8Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt16Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt16Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt32Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt32Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt64Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt64Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt8Dtype-c]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_EA_types[UInt8Dtype-python]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumprod-False]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumprod-True]": "TODO: Add a reason for failure",
"tests/extension/test_masked.py::TestMaskedArrays::test_accumulate_series[UInt16Dtype-cumsum-False]": "TODO: Add a reason for failure",
Expand Down Expand Up @@ -1908,10 +1885,6 @@ def pytest_unconfigure(config):
"tests/extension/test_numpy.py::TestNumpyExtensionArray::test_value_counts_with_normalize[object]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::Test2DCompat::test_copy_order[2D]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::Test2DCompat::test_copy_order[D]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[2D-c]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[2D-python]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[D-c]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::TestPeriodArray::test_EA_types[D-python]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::TestPeriodArray::test_astype_own_type[2D-False]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::TestPeriodArray::test_astype_own_type[D-False]": "TODO: Add a reason for failure",
"tests/extension/test_period.py::TestPeriodArray::test_astype_str[2D]": "AssertionError: Attributes of Series are different",
Expand Down Expand Up @@ -1940,22 +1913,6 @@ def pytest_unconfigure(config):
"tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-absolute]": "TODO: Add a reason for failure",
"tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-negative]": "TODO: Add a reason for failure",
"tests/extension/test_sparse.py::TestSparseArray::test_unary_ufunc_dunder_equivalence[nan-positive]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-False-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-False-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-True-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[pyarrow]-True-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-False-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-False-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-True-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=str[python]-True-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-False-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-False-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-True-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[pyarrow]-True-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-False-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-False-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-True-c]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_EA_types[string=string[python]-True-python]": "TODO: Add a reason for failure",
"tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=str[python]-False-__add__]": "AssertionError: Attributes of Series are different",
"tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=str[python]-True-__add__]": "AssertionError: Attributes of Series are different",
"tests/extension/test_string.py::TestStringArray::test_arith_series_with_array[string=string[pyarrow]-False-__radd__]": "TODO: Add a reason for failure",
Expand Down Expand Up @@ -2190,7 +2147,6 @@ def pytest_unconfigure(config):
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_avoid_block_splitting": "TODO: Add a reason for failure",
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_backend_no_conversion": "TODO: Add a reason for failure",
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_dtype_empty_object": "AssertionError: Attributes of DataFrame.iloc[:, 0] (column name='0') are different",
"tests/frame/methods/test_convert_dtypes.py::TestConvertDtypes::test_pyarrow_engine_lines_false": "TODO: Add a reason for failure",
"tests/frame/methods/test_copy.py::TestCopy::test_copy_consolidates": "TODO: Add a reason for failure",
"tests/frame/methods/test_count.py::TestDataFrameCount::test_count": "TODO: Add a reason for failure",
Expand Down Expand Up @@ -4256,7 +4212,6 @@ def pytest_unconfigure(config):
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_constructor_pass_closed[breaks3]": "TODO: Add a reason for failure",
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_constructor_pass_closed[breaks4]": "TODO: Add a reason for failure",
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_generic_errors": "TODO: Add a reason for failure",
"tests/indexes/interval/test_constructors.py::TestFromBreaks::test_left_right_dont_share_data": "TODO: Add a reason for failure",
"tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks0]": "TODO: Add a reason for failure",
"tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks2]": "TODO: Add a reason for failure",
"tests/indexes/interval/test_constructors.py::TestFromTuples::test_constructor_pass_closed[breaks3]": "TODO: Add a reason for failure",
Expand Down Expand Up @@ -6745,7 +6700,6 @@ def pytest_unconfigure(config):
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data11-maindtype11-Int8-expected_other11]": "AssertionError: Attributes of Series are different",
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data2-maindtype2-expected_default2-expected_other2]": "AssertionError: Attributes of Series are different",
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes[True-params9-data6-maindtype6-Int64-expected_other6]": "AssertionError: Attributes of Series are different",
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_null": "AssertionError: Attributes of Series are different",
"tests/series/methods/test_convert_dtypes.py::TestSeriesConvertDtypes::test_convert_dtypes_pyarrow_to_np_nullable": "TODO: Add a reason for failure",
"tests/series/methods/test_diff.py::TestSeriesDiff::test_diff_bool": "AssertionError: Attributes of Series are different",
"tests/series/methods/test_drop.py::test_drop_exception_raised[drop_labels1-0-KeyError-not found in axis]": "Failed: DID NOT RAISE <class 'KeyError'>",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
import pandas as pd
import pytest
Expand Down Expand Up @@ -45,3 +45,13 @@ def test_convert_dtypes():
with pytest.raises(NotImplementedError):
# category and datetime64[ns] are not nullable
gdf[non_nullable_columns].convert_dtypes().to_pandas(nullable=True)


def test_convert_dtypes_pyarrow_null():
    """All-null DataFrame: pyarrow-backend conversion matches pandas."""
    pytest.importorskip("pyarrow")
    backend = "pyarrow"
    all_null_columns = {"a": [None, None]}

    # Build the pandas reference first, then the cudf result to compare.
    pandas_frame = pd.DataFrame(all_null_columns)
    reference = pandas_frame.convert_dtypes(dtype_backend=backend)
    cudf_frame = cudf.DataFrame(all_null_columns)
    converted = cudf_frame.convert_dtypes(dtype_backend=backend)

    assert_eq(converted.to_pandas(), reference)
12 changes: 11 additions & 1 deletion python/cudf/cudf/tests/series/methods/test_convert_dtypes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2023-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
import pandas as pd
import pytest
Expand Down Expand Up @@ -43,3 +43,13 @@ def test_convert_integer_false_convert_floating_true():
.to_pandas(nullable=True)
)
assert_eq(result, expected)


def test_convert_dtypes_pyarrow_null():
    """All-null Series: pyarrow-backend conversion matches pandas."""
    pytest.importorskip("pyarrow")
    backend = "pyarrow"
    all_null_values = [None, None]

    # Build the pandas reference first, then the cudf result to compare.
    pandas_series = pd.Series(all_null_values)
    reference = pandas_series.convert_dtypes(dtype_backend=backend)
    cudf_series = cudf.Series(all_null_values)
    converted = cudf_series.convert_dtypes(dtype_backend=backend)

    assert_eq(converted.to_pandas(), reference)
23 changes: 0 additions & 23 deletions python/cudf/cudf/utils/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,29 +601,6 @@ def is_arrow_null_dtype(dtype: DtypeObj) -> bool:
)


def maybe_normalize_arrow_null(
    col: plc.Column, dtype: DtypeObj
) -> tuple[plc.Column, DtypeObj, DtypeObj | None]:
    """Prepare an ArrowDtype(pa.null()) column for internal construction.

    When ``dtype`` is recognized by ``is_arrow_null_dtype``, the column is
    passed through ``_normalize_types_column`` and ``np.dtype("object")``
    is substituted as the dtype. The caller's dtype is handed back as the
    third element so it can still be recorded on the resulting column.
    Any other dtype is passed through untouched.

    Parameters
    ----------
    col : plc.Column
        Column to (possibly) normalize.
    dtype : DtypeObj
        Requested dtype for the column.

    Returns
    -------
    tuple of (col, dtype, old_dtype)
        ``old_dtype`` is the dtype originally passed in when normalization
        took place, and ``None`` when the inputs are returned unchanged.
    """
    from cudf.core.column.column import _normalize_types_column

    # Guard clause: anything that is not the arrow null dtype is returned
    # exactly as received.
    if not is_arrow_null_dtype(dtype):
        return col, dtype, None

    normalized_col = _normalize_types_column(col)
    return normalized_col, np.dtype("object"), dtype


SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: dict[np.dtype[Any], plc.types.TypeId] = {
np.dtype("int8"): plc.types.TypeId.INT8,
np.dtype("int16"): plc.types.TypeId.INT16,
Expand Down
Loading