Skip to content

Commit ac19f0a

Browse files
authored
Use binary search partitioning in ReaderUtil#partitionByLeaf (#15938)
1 parent cb87f1f commit ac19f0a

3 files changed

Lines changed: 200 additions & 24 deletions

File tree

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,9 @@ Optimizations
314314

315315
* GITHUB#15954: Use SkipBlockRangeIterator as the two-phase approximation in SortedNumericDocValuesRangeQuery. (Sagar Upadhyaya)
316316

317+
* GITHUB#15938: Optimize ReaderUtil#partitionByLeaf to use binary search on leaf
318+
boundaries instead of a linear scan. (Greg Miller)
319+
317320
Bug Fixes
318321
---------------------
319322
* GITHUB#15754: Fix HTMLStripCharFilter to prevent tags from incorrectly consuming subsequent
Lines changed: 180 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,180 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.benchmark.jmh;
18+
19+
import java.util.Arrays;
20+
import java.util.Random;
21+
import java.util.concurrent.TimeUnit;
22+
import org.apache.lucene.util.ArrayUtil;
23+
import org.openjdk.jmh.annotations.Benchmark;
24+
import org.openjdk.jmh.annotations.BenchmarkMode;
25+
import org.openjdk.jmh.annotations.Fork;
26+
import org.openjdk.jmh.annotations.Level;
27+
import org.openjdk.jmh.annotations.Measurement;
28+
import org.openjdk.jmh.annotations.Mode;
29+
import org.openjdk.jmh.annotations.OutputTimeUnit;
30+
import org.openjdk.jmh.annotations.Param;
31+
import org.openjdk.jmh.annotations.Scope;
32+
import org.openjdk.jmh.annotations.Setup;
33+
import org.openjdk.jmh.annotations.State;
34+
import org.openjdk.jmh.annotations.Warmup;
35+
import org.openjdk.jmh.infra.Blackhole;
36+
37+
/**
38+
* Benchmark comparing partition strategies for ReaderUtil#partitionByLeaf. Both benchmarks operate
39+
* on pre-sorted doc IDs to isolate the partition step from sorting overhead.
40+
*
41+
* <ul>
42+
* <li>linearPartition: linear-scan partition (previous implementation)
43+
* <li>binarySearchPartition: binary-search partition using leaf boundaries (current
44+
* implementation)
45+
* </ul>
46+
*/
47+
@BenchmarkMode(Mode.Throughput)
48+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
49+
@State(Scope.Benchmark)
50+
@Warmup(iterations = 5, time = 1)
51+
@Measurement(iterations = 5, time = 1)
52+
@Fork(
53+
value = 3,
54+
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
55+
public class PartitionByLeafBenchmark {
56+
57+
private static final int[] EMPTY_INT_ARRAY = new int[0];
58+
59+
/** Number of doc IDs we'll be partitioning. */
60+
@Param({"100", "1000", "10000", "100000"})
61+
int numDocIds;
62+
63+
/** Number of leaves in the test index. */
64+
@Param({"5", "10", "20", "50", "200"})
65+
int numLeaves;
66+
67+
/** Pre-sorted doc IDs to partition. */
68+
private int[] sortedDocIds;
69+
70+
/** Leaf boundaries: leafDocBase[i] is the docBase for leaf i. */
71+
private int[] leafDocBase;
72+
73+
/** Max doc per leaf (uniform for simplicity). */
74+
private int docsPerLeaf;
75+
76+
@Setup(Level.Trial)
77+
public void setup() {
78+
Random r = new Random();
79+
80+
docsPerLeaf = Math.max(numDocIds / numLeaves, 1) * 10;
81+
int totalDocs = numLeaves * docsPerLeaf;
82+
83+
leafDocBase = new int[numLeaves];
84+
for (int i = 0; i < numLeaves; i++) {
85+
leafDocBase[i] = i * docsPerLeaf;
86+
}
87+
88+
// Generate unique doc IDs via shuffle
89+
int[] pool = new int[totalDocs];
90+
for (int i = 0; i < totalDocs; i++) {
91+
pool[i] = i;
92+
}
93+
for (int i = totalDocs - 1; i > 0; i--) {
94+
int j = r.nextInt(i + 1);
95+
int tmp = pool[i];
96+
pool[i] = pool[j];
97+
pool[j] = tmp;
98+
}
99+
sortedDocIds = ArrayUtil.copyOfSubArray(pool, 0, numDocIds);
100+
Arrays.sort(sortedDocIds);
101+
}
102+
103+
@Benchmark
104+
public void linearPartition(Blackhole bh) {
105+
bh.consume(partitionSortedLinear(sortedDocIds));
106+
}
107+
108+
@Benchmark
109+
public void binarySearchPartition(Blackhole bh) {
110+
bh.consume(partitionSortedBinarySearch(sortedDocIds));
111+
}
112+
113+
/**
114+
* Partition sorted doc IDs across leaves using a linear scan. This mirrors the previous
115+
* implementation in ReaderUtil#partitionByLeaf.
116+
*/
117+
private int[][] partitionSortedLinear(int[] sortedDocIds) {
118+
int[][] result = new int[numLeaves][];
119+
if (sortedDocIds.length == 0) {
120+
Arrays.fill(result, EMPTY_INT_ARRAY);
121+
return result;
122+
}
123+
int leafStart = 0;
124+
int leafIdx = 0;
125+
int leafEnd = leafDocBase[0] + docsPerLeaf;
126+
for (int i = 0; i < sortedDocIds.length; i++) {
127+
int docId = sortedDocIds[i];
128+
while (docId >= leafEnd) {
129+
int count = i - leafStart;
130+
if (count == 0) {
131+
result[leafIdx] = EMPTY_INT_ARRAY;
132+
} else {
133+
result[leafIdx] = new int[count];
134+
System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
135+
}
136+
leafStart = i;
137+
leafIdx++;
138+
leafEnd = leafDocBase[leafIdx] + docsPerLeaf;
139+
}
140+
}
141+
int count = sortedDocIds.length - leafStart;
142+
result[leafIdx] = new int[count];
143+
System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
144+
Arrays.fill(result, leafIdx + 1, numLeaves, EMPTY_INT_ARRAY);
145+
return result;
146+
}
147+
148+
/**
149+
* Partition sorted doc IDs across leaves using binary search on leaf boundaries. For each leaf,
150+
* binary search for its end boundary in the sorted doc IDs to find the slice belonging to that
151+
* leaf. Each successive search is bounded by the previous result. Includes an O(1) peek to skip
152+
* empty leaves and early termination when all docs are placed.
153+
*/
154+
private int[][] partitionSortedBinarySearch(int[] sortedDocIds) {
155+
int[][] result = new int[numLeaves][];
156+
if (sortedDocIds.length == 0) {
157+
Arrays.fill(result, EMPTY_INT_ARRAY);
158+
return result;
159+
}
160+
int from = 0;
161+
int leafIdx = 0;
162+
for (; leafIdx < numLeaves && from < sortedDocIds.length; leafIdx++) {
163+
int leafEnd = leafDocBase[leafIdx] + docsPerLeaf;
164+
if (sortedDocIds[from] >= leafEnd) {
165+
result[leafIdx] = EMPTY_INT_ARRAY;
166+
continue;
167+
}
168+
int to = Arrays.binarySearch(sortedDocIds, from, sortedDocIds.length, leafEnd);
169+
if (to < 0) {
170+
to = -to - 1;
171+
}
172+
int count = to - from;
173+
result[leafIdx] = new int[count];
174+
System.arraycopy(sortedDocIds, from, result[leafIdx], 0, count);
175+
from = to;
176+
}
177+
Arrays.fill(result, leafIdx, numLeaves, EMPTY_INT_ARRAY);
178+
return result;
179+
}
180+
}

lucene/core/src/java/org/apache/lucene/index/ReaderUtil.java

Lines changed: 17 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -114,33 +114,26 @@ public static int[][] partitionByLeaf(ScoreDoc[] hits, List<LeafReaderContext> l
114114
sortedDocIds[i] = hits[i].doc;
115115
}
116116
Arrays.sort(sortedDocIds);
117-
int leafStart = 0;
117+
int from = 0;
118118
int leafIdx = 0;
119-
LeafReaderContext leaf = leaves.getFirst();
120-
int leafEnd = leaf.docBase + leaf.reader().maxDoc();
121-
for (int i = 0; i < sortedDocIds.length; i++) {
122-
int docId = sortedDocIds[i];
123-
while (docId >= leafEnd) {
124-
int count = i - leafStart;
125-
if (count == 0) {
126-
result[leafIdx] = EMPTY_INT_ARRAY;
127-
} else {
128-
result[leafIdx] = new int[count];
129-
System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
130-
}
131-
leafStart = i;
132-
leafIdx++;
133-
leaf = leaves.get(leafIdx);
134-
leafEnd = leaf.docBase + leaf.reader().maxDoc();
119+
for (; leafIdx < numLeaves && from < sortedDocIds.length; leafIdx++) {
120+
LeafReaderContext leaf = leaves.get(leafIdx);
121+
int leafEnd = leaf.docBase + leaf.reader().maxDoc();
122+
if (sortedDocIds[from] >= leafEnd) {
123+
result[leafIdx] = EMPTY_INT_ARRAY;
124+
continue;
125+
}
126+
int to = Arrays.binarySearch(sortedDocIds, from, sortedDocIds.length, leafEnd);
127+
if (to < 0) {
128+
to = -to - 1;
135129
}
130+
int count = to - from;
131+
assert count > 0;
132+
result[leafIdx] = new int[count];
133+
System.arraycopy(sortedDocIds, from, result[leafIdx], 0, count);
134+
from = to;
136135
}
137-
// Handle remaining docIDs
138-
int count = sortedDocIds.length - leafStart;
139-
assert count > 0;
140-
result[leafIdx] = new int[count];
141-
System.arraycopy(sortedDocIds, leafStart, result[leafIdx], 0, count);
142-
// Fill remaining empty leaves
143-
Arrays.fill(result, leafIdx + 1, numLeaves, EMPTY_INT_ARRAY);
136+
Arrays.fill(result, leafIdx, numLeaves, EMPTY_INT_ARRAY);
144137
return result;
145138
}
146139
}

0 commit comments

Comments
 (0)