Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,12 @@ Performance improvements
- Performance improvement in :meth:`Series.iloc` and :meth:`DataFrame.iloc`
when setting datetimelike values into object-dtype data with list-like
indexers (:issue:`64250`).
- Performance improvement in :meth:`Series.isin` and :meth:`DataFrame.isin`
when ``values`` is a ``set`` or ``frozenset`` and the calling object has
integer or boolean dtype (:issue:`25507`).
- Performance improvement in :meth:`Series.isin` and :meth:`DataFrame.isin`
when the calling object has unsigned-integer, float, complex, or boolean
dtype and ``values`` is a list-like of numeric values (:issue:`46485`).

.. ---------------------------------------------------------------------------
.. _whatsnew_310.bug_fixes:
Expand Down
44 changes: 42 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,28 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
f"to isin(), you passed a `{type(values).__name__}`"
)

if isinstance(values, (set, frozenset)) and len(values) > 0:
# GH#25507: for a set of values, membership can be tested directly
# via the set, avoiding an O(len(values)) materialization that
# otherwise dominates when comps is much smaller than values.
# Restrict to integer/bool comps (i.e. dtypes that cannot contain
# NaN), since Python set membership would mis-handle the case where
# both sides contain NaN values that are not identical.
if isinstance(comps, (ABCSeries, ABCIndex)):
comps_arr = comps._values
else:
comps_arr = comps
if (
isinstance(comps_arr, np.ndarray)
and comps_arr.ndim == 1
and comps_arr.dtype.kind in "iub"
):
return np.fromiter(
(item in values for item in comps_arr.tolist()),
dtype=bool,
count=comps_arr.size,
)

if not isinstance(values, (ABCIndex, ABCSeries, ABCExtensionArray, np.ndarray)):
orig_values = list(values)
values = _ensure_arraylike(orig_values, func_name="isin-targets")
Expand All @@ -529,9 +551,27 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
and not is_signed_integer_dtype(comps)
and not is_dtype_equal(values, comps)
):
# GH#46485 Use object to avoid upcast to float64 later
# GH#46485: np.result_type(int, uint) is float64, which loses
# precision for uint64 magnitudes > 2**53. Only recast to object
# in that precision-losing case — otherwise keep the numeric
# ndarray so the downstream htable dispatch can use the fast
# numeric path.
# TODO: Share with _find_common_type_compat
values = construct_1d_object_array_from_listlike(orig_values)
comps_dtype = getattr(comps, "dtype", None)
# values came from _ensure_arraylike's numeric branch, so its
# dtype is an np.dtype.
values_dtype = cast("np.dtype", values.dtype)
if not isinstance(comps_dtype, np.dtype):
needs_object = True
else:
common = np_find_common_type(values_dtype, comps_dtype)
needs_object = common.kind not in "iufcb" or (
common.kind == "f"
and values_dtype.kind in "iu"
and comps_dtype.kind in "iu"
)
if needs_object:
values = construct_1d_object_array_from_listlike(orig_values)

elif isinstance(values, ABCMultiIndex):
# Avoid raising in extract_array
Expand Down
24 changes: 24 additions & 0 deletions pandas/tests/series/methods/test_isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,3 +267,27 @@ def test_isin_filtering_on_iterable(data, isin):
expected_result = Series([True, True, False])

tm.assert_series_equal(result, expected_result)


@pytest.mark.parametrize("set_cls", [set, frozenset])
@pytest.mark.parametrize("dtype", ["int64", "int32", "uint8", "uint64", "bool"])
def test_isin_set_matches_list(set_cls, dtype):
    """Check ``isin`` gives the same answer for a set and a list of targets.

    GH#25507: for integer/bool data, passing the targets as a ``set`` or
    ``frozenset`` must produce exactly the result obtained when the same
    targets are passed as a plain ``list``.
    """
    if dtype != "bool":
        wanted = [2, 4, 7]
        ser = Series([1, 2, 3, 4, 5], dtype=dtype)
    else:
        wanted = [True]
        ser = Series([True, False, True, False])

    # The list-based call is the reference behavior.
    from_list = ser.isin(list(wanted))
    from_set = ser.isin(set_cls(wanted))
    tm.assert_series_equal(from_set, from_list)


def test_isin_empty_set():
    """An empty ``set`` or ``frozenset`` of targets matches no element.

    GH#25507: the set-membership branch in ``algorithms.isin`` is only
    entered when ``len(values) > 0``, so empty sets must still be handled
    correctly by the generic path. Both container types accepted by that
    branch's ``isinstance`` check are covered here.
    """
    ser = Series([1, 2, 3])
    expected = Series([False, False, False])
    for empty in (set(), frozenset()):
        result = ser.isin(empty)
        tm.assert_series_equal(result, expected)
Loading