diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 3ef7906939259..8a2e8ec9200b7 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -198,6 +198,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.__repr__` where the values and categories lines could exceed ``display.width`` (:issue:`12066`) +- Bug in :meth:`Categorical.map` where unordered categoricals preserved the positional category order from the original categories instead of sorting the mapped values, causing :meth:`DataFrame.sort_values` with ``key`` to ignore custom sort orders (:issue:`58153`) - Bug in :meth:`CategoricalIndex.union` and :meth:`CategoricalIndex.intersection` giving incorrect results when the two indexes have the same unordered categories in different orders (:issue:`55335`) - Bug in :meth:`Index.fillna` raising ``TypeError`` when filling with a tuple value (e.g. on object-dtype or :class:`CategoricalIndex` with tuple categories) (:issue:`37681`) - @@ -244,6 +245,7 @@ Conversion ^^^^^^^^^^ - Bug in :class:`DataFrame` constructor where ``NaT`` in a :class:`TimedeltaIndex` row was incorrectly inferred as ``datetime64`` instead of ``timedelta64`` (:issue:`23985`) - Bug in :class:`DataFrame` constructor where constructing from a list of uniform-dtype arrays (e.g. pyarrow, :class:`CategoricalDtype`, nullable dtypes) lost the dtype (:issue:`49593`) +- Bug in :func:`pd.array` raising ``ArrowTypeError`` when constructing an :class:`ArrowDtype` string array from a sequence containing ``np.nan`` (:issue:`64578`) - Bug in :func:`pd.array` silently converting NaN to a nonsensical integer when given float data containing NaN and a NumPy integer dtype (:issue:`41724`) - Fixed :func:`pandas.array` to preserve mask information when converting NumPy masked arrays, converting masked values to missing values (:issue:`63879`). - Fixed bug in :meth:`DataFrame.from_records` where ``exclude`` was ignored when ``data`` was an iterator and ``nrows=0`` (:issue:`63774`) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index b20599f5bc921..70597f6839e3d 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -650,7 +650,10 @@ def _box_pa_array( mask = None if is_nan_na(): try: - arr_value = np.asarray(value) + # GH#64578: use dtype=object to preserve scalar types + # (e.g. np.nan stays float, not coerced to string 'nan') + # so that isna() correctly identifies NA entries. + arr_value = np.asarray(value, dtype=object) if arr_value.ndim > 1: # e.g. test_fixed_size_list we have list data. ndim > 1 # means there were no scalar (NA) entries. diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 959e4d452c47f..a4facf23f7c49 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3075,6 +3075,26 @@ def test_from_sequence_of_strings_boolean(): ArrowExtensionArray._from_sequence_of_strings(strings, dtype=dtype) +def test_arrow_array_constructor_with_nan(): + # GH#64578: pd.array with ArrowDtype should treat np.nan as missing value + # regardless of the element dtype of the sequence. + import pyarrow as pa + + result = pd.array(["a", np.nan], dtype=ArrowDtype(pa.string())) + expected = pd.array(["a", None], dtype=ArrowDtype(pa.string())) + tm.assert_extension_array_equal(result, expected) + + # Also verify with large_string + result2 = pd.array(["a", np.nan], dtype=ArrowDtype(pa.large_string())) + expected2 = pd.array(["a", None], dtype=ArrowDtype(pa.large_string())) + tm.assert_extension_array_equal(result2, expected2) + + # Mixed int/nan should still work + result3 = pd.array([1, np.nan, 3], dtype=ArrowDtype(pa.float64())) + expected3 = pd.array([1, None, 3], dtype=ArrowDtype(pa.float64())) + tm.assert_extension_array_equal(result3, expected3) + + def test_concat_empty_arrow_backed_series(dtype): # GH#51734 ser = pd.Series([], dtype=dtype)