Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ Performance improvements
- Performance improvement in :func:`merge` with ``how="cross"`` (:issue:`38082`)
- Performance improvement in :func:`merge` with ``how="left"`` (:issue:`64370`)
- Performance improvement in :func:`merge` with ``sort=False`` for single-key ``how="left"``/``how="right"`` joins when the opposite join key is sorted, unique, and range-like (:issue:`64146`)
- Performance improvement in :func:`merge` (non-inner), :meth:`DataFrame.unstack`, :meth:`DataFrame.pivot_table`, :func:`crosstab`, and :class:`GroupBy` aggregations by short-circuiting scans for missing-indexer sentinels
- Performance improvement in :func:`read_csv` with ``engine="c"`` when reading from binary file-like objects (e.g. PyArrow S3 file handles) by avoiding unnecessary ``TextIOWrapper`` wrapping (:issue:`46823`)
- Performance improvement in :func:`read_html` and the Python CSV parser when ``thousands`` is set, fixing catastrophic regex backtracking on cells with many comma-separated digit groups followed by non-numeric text (:issue:`52619`)
- Performance improvement in :func:`read_sas` by reading page header fields directly in Cython instead of falling back to Python (:issue:`47339`)
Expand Down
4 changes: 4 additions & 0 deletions pandas/_libs/lib.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,7 @@ def is_sequence_range(
sequence: np.ndarray,
step: int, # np.ndarray[np.int64, ndim=1]
) -> bool: ...
# Fast scan for a sentinel value (e.g. -1 for "missing" in indexer arrays);
# equivalent to `(arr == sentinel).any()` but short-circuits on first hit.
def has_sentinel(
    arr: np.ndarray,  # const int6432_t[:]
    sentinel: int,
) -> bool: ...
94 changes: 87 additions & 7 deletions pandas/_libs/lib.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ from numpy cimport (
complex128_t,
flatiter,
float64_t,
int8_t,
int16_t,
int32_t,
int64_t,
intp_t,
Expand Down Expand Up @@ -483,17 +485,30 @@ def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
def has_infs(const floating[:] arr) -> bool:
    # Return True if `arr` contains +inf or -inf; equivalent to
    # `np.isinf(arr).any()` but short-circuits on the first hit.
    # NaN compares unequal to everything, so NaN values are (correctly)
    # not treated as infinite here.
    cdef:
        Py_ssize_t i, n = len(arr)
        Py_ssize_t n4 = n & ~3  # round down to multiple of 4
        floating inf, neginf, val
        bint ret = False

    inf = np.inf
    neginf = -inf
    with nogil:
        # Main loop: 4 elements per iteration so the C compiler can
        # auto-vectorize the comparisons (see comment below on `|`).
        for i in range(0, n4, 4):
            # Bitwise | (not `or`) so all 4 lanes evaluate unconditionally,
            # letting the compiler emit vectorized comparisons.
            if (
                (arr[i] == inf) | (arr[i] == neginf)
                | (arr[i + 1] == inf) | (arr[i + 1] == neginf)
                | (arr[i + 2] == inf) | (arr[i + 2] == neginf)
                | (arr[i + 3] == inf) | (arr[i + 3] == neginf)
            ):
                ret = True
                break
        # Tail loop: handle the 0-3 leftover elements (only if the
        # unrolled loop did not already find an inf).
        if not ret:
            for i in range(n4, n):
                val = arr[i]
                if val == inf or val == neginf:
                    ret = True
                    break
    return ret


Expand Down Expand Up @@ -664,16 +679,27 @@ def is_range_indexer(const int6432_t[:] left, Py_ssize_t n) -> bool:
"""
cdef:
Py_ssize_t i
Py_ssize_t n4 = n & ~3
bint ret = True

if left.size != n:
return False

with nogil:
for i in range(n):
if left[i] != i:
for i in range(0, n4, 4):
if (
(left[i] != i)
| (left[i + 1] != i + 1)
| (left[i + 2] != i + 2)
| (left[i + 3] != i + 3)
):
ret = False
break
if ret:
for i in range(n4, n):
if left[i] != i:
ret = False
break
return ret


Expand All @@ -685,6 +711,7 @@ def is_sequence_range(const int6432_t[:] sequence, int64_t step) -> bool:
"""
cdef:
Py_ssize_t i, n = len(sequence)
Py_ssize_t n4 = n & ~3
int6432_t first_element
bint ret = True

Expand All @@ -694,11 +721,64 @@ def is_sequence_range(const int6432_t[:] sequence, int64_t step) -> bool:
return True

first_element = sequence[0]
# sequence[0] == first_element by construction, so the i=0 lane of the
# unrolled loop is trivially true — skipping the explicit head loop
# costs one redundant compare on the first iteration.
with nogil:
for i in range(1, n):
if sequence[i] != first_element + i * step:
for i in range(0, n4, 4):
if (
(sequence[i] != first_element + i * step)
| (sequence[i + 1] != first_element + (i + 1) * step)
| (sequence[i + 2] != first_element + (i + 2) * step)
| (sequence[i + 3] != first_element + (i + 3) * step)
):
ret = False
break
if ret:
for i in range(n4, n):
if sequence[i] != first_element + i * step:
ret = False
break
return ret


ctypedef fused signed_int_t:
    int8_t
    int16_t
    int32_t
    int64_t


@cython.wraparound(False)
@cython.boundscheck(False)
def has_sentinel(const signed_int_t[:] arr, signed_int_t sentinel) -> bool:
    """
    Check whether ``sentinel`` occurs anywhere in ``arr``.

    Faster equivalent to `(arr == sentinel).any()` for integer indexers.
    """
    cdef:
        Py_ssize_t pos
        Py_ssize_t size = arr.shape[0]
        # Largest multiple of 8 <= size; the main loop consumes 8 lanes
        # per iteration and the tail loop finishes the remainder.
        Py_ssize_t vec_end = size & ~7
        bint found = False

    with nogil:
        for pos in range(0, vec_end, 8):
            # Bitwise | keeps all 8 comparisons unconditional, which lets
            # the C compiler emit vectorized compares for the whole group.
            if (
                (arr[pos] == sentinel)
                | (arr[pos + 1] == sentinel)
                | (arr[pos + 2] == sentinel)
                | (arr[pos + 3] == sentinel)
                | (arr[pos + 4] == sentinel)
                | (arr[pos + 5] == sentinel)
                | (arr[pos + 6] == sentinel)
                | (arr[pos + 7] == sentinel)
            ):
                found = True
                break
        if not found:
            # Scalar tail: at most 7 leftover elements.
            pos = vec_end
            while pos < size:
                if arr[pos] == sentinel:
                    found = True
                    break
                pos += 1
    return found


Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -4648,7 +4648,7 @@ def igetitem(obj, i: int):

else:
ilocs = self.columns.get_indexer_non_unique(key)[0]
if (ilocs < 0).any():
if lib.has_sentinel(ilocs, -1):
# key entries not in self.columns
raise NotImplementedError

Expand Down
2 changes: 1 addition & 1 deletion pandas/core/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -4144,7 +4144,7 @@ def _to_bool_indexer(indexer) -> npt.NDArray[np.bool_]:
if missing_mask.sum() > na_count:
raise KeyError(k) from None
# NaN is in k but must also be present in the data
if not (level_codes == -1).any():
if not lib.has_sentinel(level_codes, -1):
raise KeyError(k) from None
elif missing_mask.any():
raise KeyError(k) from None
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/reshape/merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -1288,7 +1288,7 @@ def _maybe_add_join_keys(
left_has_missing = (
False
if left_indexer is None
else (left_indexer == -1).any()
else lib.has_sentinel(left_indexer, -1)
)

if left_has_missing:
Expand All @@ -1302,7 +1302,7 @@ def _maybe_add_join_keys(
right_has_missing = (
False
if right_indexer is None
else (right_indexer == -1).any()
else lib.has_sentinel(right_indexer, -1)
)

if right_has_missing:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -459,7 +459,7 @@ def new_index(self) -> MultiIndex | Index:
# construct the new index
if len(self.new_index_levels) == 1:
level, level_codes = self.new_index_levels[0], result_codes[0]
if (level_codes == -1).any():
if lib.has_sentinel(level_codes, -1):
level = level.insert(len(level), level._na_value)
return level.take(level_codes).rename(self.new_index_names[0])

Expand Down
4 changes: 2 additions & 2 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ def _int64_cut_off(shape) -> int:
def maybe_lift(lab, size: int) -> tuple[np.ndarray, int]:
# promote nan values (assigned -1 label in lab array)
# so that all output values are non-negative
return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
return (lab + 1, size + 1) if lib.has_sentinel(lab, -1) else (lab, size)

labels = [ensure_int64(x) for x in labels]
lshape = list(shape)
Expand Down Expand Up @@ -282,7 +282,7 @@ def decons_obs_group_ids(
If nulls are excluded; i.e. -1 labels are passed through.
"""
if not xnull:
lift = np.fromiter(((a == -1).any() for a in labels), dtype=np.intp)
lift = np.fromiter((lib.has_sentinel(a, -1) for a in labels), dtype=np.intp)
arr_shape = np.asarray(shape, dtype=np.intp) + lift
shape = tuple(arr_shape)

Expand Down
Loading