diff --git a/doc/source/whatsnew/v3.1.0.rst b/doc/source/whatsnew/v3.1.0.rst index 3ef7906939259..d44b4717358b0 100644 --- a/doc/source/whatsnew/v3.1.0.rst +++ b/doc/source/whatsnew/v3.1.0.rst @@ -198,6 +198,7 @@ Bug fixes Categorical ^^^^^^^^^^^ - Bug in :meth:`Categorical.__repr__` where the values and categories lines could exceed ``display.width`` (:issue:`12066`) +- Bug in :meth:`Categorical.map` where unordered categoricals preserved the positional category order from the original categories instead of sorting the mapped values, causing :meth:`DataFrame.sort_values` with ``key`` to ignore custom sort orders (:issue:`58153`) - Bug in :meth:`CategoricalIndex.union` and :meth:`CategoricalIndex.intersection` giving incorrect results when the two indexes have the same unordered categories in different orders (:issue:`55335`) - Bug in :meth:`Index.fillna` raising ``TypeError`` when filling with a tuple value (e.g. on object-dtype or :class:`CategoricalIndex` with tuple categories) (:issue:`37681`) - diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 83fe0965ec123..077f30a48d788 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -1641,8 +1641,26 @@ def map( na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan) if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan: + codes = self._codes.copy() + if not self.ordered: + # GH#58153: For unordered categoricals, sort the mapped + # categories so that category order reflects the natural + # ordering of the new values, not the positional order + # inherited from the original categories. + try: + indexer = new_categories.argsort() + except TypeError: + # Mixed types (e.g. str and float) can't be compared; + # skip sorting and keep original category order. + pass + else: + new_categories = new_categories.take(indexer) + reverse_indexer = np.empty(len(indexer), dtype=np.intp) + reverse_indexer[indexer] = np.arange(len(indexer)) + mask = codes >= 0 + codes[mask] = reverse_indexer[codes[mask]] new_dtype = CategoricalDtype(new_categories, ordered=self.ordered) - return self.from_codes(self._codes.copy(), dtype=new_dtype, validate=False) + return self.from_codes(codes, dtype=new_dtype, validate=False) if has_nans: new_categories = new_categories.insert(len(new_categories), na_val) diff --git a/pandas/tests/arrays/categorical/test_map.py b/pandas/tests/arrays/categorical/test_map.py index cfbdc2cb70eee..ef8caf0e13282 100644 --- a/pandas/tests/arrays/categorical/test_map.py +++ b/pandas/tests/arrays/categorical/test_map.py @@ -22,8 +22,12 @@ def test_map_str(data, categories, ordered, na_action): # GH 31202 - override base class since we want to maintain categorical/ordered cat = Categorical(data, categories=categories, ordered=ordered) result = cat.map(str, na_action=na_action) + expected_categories = list(map(str, categories)) + if not ordered: + # GH#58153: Unordered categoricals sort categories after map + expected_categories = sorted(expected_categories) expected = Categorical( - map(str, data), categories=map(str, categories), ordered=ordered + map(str, data), categories=expected_categories, ordered=ordered ) tm.assert_categorical_equal(result, expected) @@ -36,7 +40,8 @@ def test_map(na_action): cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False) result = cat.map(lambda x: x.lower(), na_action=na_action) - exp = Categorical(list("ababc"), categories=list("bac"), ordered=False) + # GH#58153: Unordered categoricals sort categories after map + exp = Categorical(list("ababc"), categories=list("abc"), ordered=False) tm.assert_categorical_equal(result, exp) # GH 12766: Return an index not an array @@ -51,7 +56,8 @@ def f(x): return {"A": 10, "B": 20, "C": 30}.get(x) result = cat.map(f, na_action=na_action) - exp = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False) + # GH#58153: Unordered categoricals sort categories after map + exp = Categorical([10, 20, 10, 20, 30], categories=[10, 20, 30], ordered=False) tm.assert_categorical_equal(result, exp) mapper = Series([10, 20, 30], index=["A", "B", "C"]) diff --git a/pandas/tests/indexes/categorical/test_map.py b/pandas/tests/indexes/categorical/test_map.py index 36605a2c9990d..6a4cc68b1ea8b 100644 --- a/pandas/tests/indexes/categorical/test_map.py +++ b/pandas/tests/indexes/categorical/test_map.py @@ -22,8 +22,12 @@ def test_map_str(data, categories, ordered): # GH 31202 - override base class since we want to maintain categorical/ordered index = CategoricalIndex(data, categories=categories, ordered=ordered) result = index.map(str) + expected_categories = list(map(str, categories)) + if not ordered: + # GH#58153: Unordered categoricals sort categories after map + expected_categories = sorted(expected_categories) expected = CategoricalIndex( - map(str, data), categories=map(str, categories), ordered=ordered + map(str, data), categories=expected_categories, ordered=ordered ) tm.assert_index_equal(result, expected) @@ -38,8 +42,9 @@ def test_map(): list("ABABC"), categories=list("BAC"), ordered=False, name="XXX" ) result = ci.map(lambda x: x.lower()) + # GH#58153: Unordered categoricals sort categories after map exp = CategoricalIndex( - list("ababc"), categories=list("bac"), ordered=False, name="XXX" + list("ababc"), categories=list("abc"), ordered=False, name="XXX" ) tm.assert_index_equal(result, exp) @@ -55,7 +60,8 @@ def f(x): return {"A": 10, "B": 20, "C": 30}.get(x) result = ci.map(f) - exp = CategoricalIndex([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False) + # GH#58153: Unordered categoricals sort categories after map + exp = CategoricalIndex([10, 20, 10, 20, 30], categories=[10, 20, 30], ordered=False) tm.assert_index_equal(result, exp) result = ci.map(Series([10, 20, 30], index=["A", "B", "C"]))