Skip to content

Commit 6c4804e

Browse files
committed
Eliminate redundant cardinality() pass in MaxScoreBulkScorer
Signed-off-by: prithvi <prithvisivasankar@gmail.com>
1 parent fa0e704 commit 6c4804e

3 files changed

Lines changed: 225 additions & 1 deletion

File tree

lucene/CHANGES.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,9 @@ Improvements
125125

126126
Optimizations
127127
---------------------
128+
* Eliminate redundant cardinality() pass in MaxScoreBulkScorer by pre-allocating the
129+
doc/score buffer to the max window size. (Prithvi S)
130+
128131
* GITHUB#15681, GITHUB#15833: Replace pre-sized array or empty array with lambda expression to call Collection#toArray. (Zhou Hui)
129132

130133
* GITHUB#13782: Replace handwritten loops compare with Arrays.compareUnsigned in TermsEnum and TermsEnumFrame classes. (Zhou Hui)
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.benchmark.jmh;
18+
19+
import java.io.IOException;
20+
import java.util.Arrays;
21+
import java.util.SplittableRandom;
22+
import java.util.concurrent.TimeUnit;
23+
import org.apache.lucene.util.FixedBitSet;
24+
import org.apache.lucene.util.IOIntConsumer;
25+
import org.openjdk.jmh.annotations.Benchmark;
26+
import org.openjdk.jmh.annotations.BenchmarkMode;
27+
import org.openjdk.jmh.annotations.Fork;
28+
import org.openjdk.jmh.annotations.Level;
29+
import org.openjdk.jmh.annotations.Measurement;
30+
import org.openjdk.jmh.annotations.Mode;
31+
import org.openjdk.jmh.annotations.OutputTimeUnit;
32+
import org.openjdk.jmh.annotations.Param;
33+
import org.openjdk.jmh.annotations.Scope;
34+
import org.openjdk.jmh.annotations.Setup;
35+
import org.openjdk.jmh.annotations.State;
36+
import org.openjdk.jmh.annotations.Warmup;
37+
import org.openjdk.jmh.infra.Blackhole;
38+
39+
/**
40+
* Benchmark comparing bitset extraction strategies used in
41+
* MaxScoreBulkScorer.scoreInnerWindowMultipleEssentialClauses().
42+
*
43+
* <p>Three strategies are compared:
44+
*
45+
* <ol>
46+
* <li><b>oldCardinalityForEach</b>: cardinality() + forEach(lambda) + clear() — 3 passes
47+
* <li><b>newForEachNoCardinality</b>: forEach(lambda) + clear() with pre-allocated buffer — 2
48+
* passes (eliminates cardinality)
49+
* <li><b>newIntoArray</b>: intoArray() + score gather loop + clear() — single extraction pass
50+
* </ol>
51+
*
52+
* <p>All three benchmarks include the populate step (setting bits + scores) to simulate the full
53+
* inner-window lifecycle.
54+
*/
55+
@BenchmarkMode(Mode.Throughput)
56+
@OutputTimeUnit(TimeUnit.MICROSECONDS)
57+
@State(Scope.Benchmark)
58+
@Warmup(iterations = 5, time = 1)
59+
@Measurement(iterations = 5, time = 1)
60+
@Fork(
61+
value = 1,
62+
jvmArgsAppend = {"-Xmx1g", "-Xms1g", "-XX:+AlwaysPreTouch"})
63+
public class WindowExtractionBenchmark {
64+
65+
static final int INNER_WINDOW_SIZE = 1 << 12; // 4096, same as MaxScoreBulkScorer
66+
67+
/**
68+
* Number of matching documents in the window. Realistic values range from very sparse (10) to
69+
* moderately dense (2000). Multi-term boolean queries typically match 50-500 docs per window.
70+
*/
71+
@Param({"10", "50", "128", "500", "1000", "2000"})
72+
int matchCount;
73+
74+
private final SplittableRandom random = new SplittableRandom(42);
75+
76+
// Simulates MaxScoreBulkScorer's fields
77+
private FixedBitSet windowMatches;
78+
private double[] windowScores;
79+
private int innerWindowMin;
80+
81+
// Output buffers (pre-allocated to max size, like MaxScoreBulkScorer reuses them)
82+
private int[] outDocs;
83+
private double[] outScores;
84+
private int outSize;
85+
86+
// Pre-computed match positions and scores for deterministic setup
87+
private int[] matchPositions;
88+
private double[] matchScoreValues;
89+
90+
/**
* Allocates the simulated scorer state and the output buffers, then precomputes the match
* positions and scores that populateWindow() replays. Annotated with Level.Trial, so JMH runs it
* once per trial rather than once per benchmark invocation.
*/
@Setup(Level.Trial)
91+
public void setupTrial() {
92+
windowMatches = new FixedBitSet(INNER_WINDOW_SIZE);
93+
windowScores = new double[INNER_WINDOW_SIZE];
94+
// +1 for denseWord2Array sentinel slot (NOTE(review): FixedBitSet internals are not visible here; confirm the extra slot is required by intoArray)
95+
outDocs = new int[INNER_WINDOW_SIZE + 1];
96+
outScores = new double[INNER_WINDOW_SIZE + 1];
97+
outSize = 0;
98+
innerWindowMin = 100_000; // arbitrary base doc ID
99+
100+
// Pre-compute matchCount distinct random positions; temp records the positions already taken
// so duplicates are rejected. The loop only terminates because every @Param matchCount is
// <= INNER_WINDOW_SIZE. Each accepted position gets a random score in [0, 10).
101+
matchPositions = new int[matchCount];
102+
matchScoreValues = new double[matchCount];
103+
FixedBitSet temp = new FixedBitSet(INNER_WINDOW_SIZE);
104+
int count = 0;
105+
while (count < matchCount) {
106+
int pos = random.nextInt(INNER_WINDOW_SIZE);
107+
if (!temp.get(pos)) {
108+
temp.set(pos);
109+
matchPositions[count] = pos;
110+
matchScoreValues[count] = random.nextDouble() * 10.0;
111+
count++;
112+
}
113+
}
114+
// Only the positions are sorted; matchScoreValues keeps generation order, so the
// position/score pairing changes here. Harmless because the scores are random anyway.
Arrays.sort(matchPositions);
115+
}
116+
117+
/**
* Populate the bitset and windowScores — simulates what the essential clause collection does.
*
* <p>For every precomputed match position, sets the corresponding bit in windowMatches and
* assigns the score at the same array index to windowScores. Every benchmark method calls this
* first, so its cost is included in each measurement.
*/
118+
private void populateWindow() {
119+
for (int i = 0; i < matchPositions.length; i++) {
120+
int pos = matchPositions[i];
121+
windowMatches.set(pos);
122+
windowScores[pos] = matchScoreValues[i];
123+
}
124+
}
125+
126+
/**
127+
* ORIGINAL: cardinality() + forEach(lambda) + clear(). This is what the code did before any
128+
* optimization — 3 passes over the bitset.
*
* <p>The window is repopulated at the start of each invocation, so populateWindow() is part of
* the measured cost. The cardinality result is only handed to the Blackhole; no buffer is grown
* here because outDocs/outScores are allocated once in setupTrial().
*
* @param bh JMH blackhole that consumes the cardinality and the score buffer
* @return the number of doc/score pairs written to outDocs/outScores
* @throws IOException declared for the forEach call that takes an IOIntConsumer; the lambda
*     itself performs no I/O
129+
*/
130+
@Benchmark
131+
public int oldCardinalityForEach(Blackhole bh) throws IOException {
132+
populateWindow();
133+
int innerWindowSize = INNER_WINDOW_SIZE;
134+
135+
// Pass 1: count bits to pre-size buffer
136+
int card = windowMatches.cardinality(0, innerWindowSize);
137+
// In original code: docAndScoreAccBuffer.growNoCopy(card)
138+
// We simulate with pre-allocated buffer, but cardinality() cost is still measured
139+
140+
// Pass 2: forEach with lambda to extract docs + scores + zero scores
// (the lambda receives window-relative indices: it indexes windowScores directly and adds
// innerWindowMin itself to form the doc ID)
141+
outSize = 0;
142+
windowMatches.forEach(
143+
0,
144+
innerWindowSize,
145+
0,
146+
(IOIntConsumer)
147+
index -> {
148+
outDocs[outSize] = innerWindowMin + index;
149+
outScores[outSize] = windowScores[index];
150+
outSize++;
151+
windowScores[index] = 0d;
152+
});
153+
154+
// Pass 3: clear the bitset
155+
windowMatches.clear(0, innerWindowSize);
156+
157+
// Consume the cardinality so the JIT cannot discard pass 1 as dead code
bh.consume(card);
158+
bh.consume(outScores);
159+
return outSize;
160+
}
161+
162+
/**
163+
* OPTIMIZED: forEach(lambda) + clear() with pre-allocated buffer. Eliminates the cardinality()
164+
* pass — 2 passes over the bitset. This is the current implementation.
*
* <p>Identical to oldCardinalityForEach except that cardinality() is never called. The window
* is repopulated at the start of each invocation, so populateWindow() is part of the measured
* cost.
*
* @param bh JMH blackhole that consumes the score buffer
* @return the number of doc/score pairs written to outDocs/outScores
* @throws IOException declared for the forEach call that takes an IOIntConsumer; the lambda
*     itself performs no I/O
165+
*/
166+
@Benchmark
167+
public int newForEachNoCardinality(Blackhole bh) throws IOException {
168+
populateWindow();
169+
int innerWindowSize = INNER_WINDOW_SIZE;
170+
171+
// No cardinality pass needed — outDocs/outScores are allocated once in setupTrial() with INNER_WINDOW_SIZE + 1 slots
172+
173+
// Single extraction pass: forEach with lambda
// (the lambda receives window-relative indices: it indexes windowScores directly and adds
// innerWindowMin itself to form the doc ID)
174+
outSize = 0;
175+
windowMatches.forEach(
176+
0,
177+
innerWindowSize,
178+
0,
179+
(IOIntConsumer)
180+
index -> {
181+
outDocs[outSize] = innerWindowMin + index;
182+
outScores[outSize] = windowScores[index];
183+
outSize++;
184+
windowScores[index] = 0d;
185+
});
186+
187+
// Clear the bitset
188+
windowMatches.clear(0, innerWindowSize);
189+
190+
bh.consume(outScores);
191+
return outSize;
192+
}
193+
194+
/**
195+
* ALTERNATIVE: intoArray() + score gather loop + clear(). Uses the optimized branchless
196+
* denseWord2Array for bit extraction — best for dense windows.
*
* <p>NOTE(review): the denseWord2Array and "best for dense windows" statements describe
* FixedBitSet internals and performance that this file does not show — confirm against the
* FixedBitSet source and the benchmark results.
*
* <p>Unlike the forEach variants, this method does not update the outSize field; the count is
* only returned. The window is repopulated at the start of each invocation, so
* populateWindow() is part of the measured cost.
*
* @param bh JMH blackhole that consumes the score buffer
* @return the count reported by intoArray, i.e. the number of entries filled in outDocs/outScores
197+
*/
198+
@Benchmark
199+
public int newIntoArray(Blackhole bh) {
200+
populateWindow();
201+
int innerWindowSize = INNER_WINDOW_SIZE;
202+
203+
// Single pass: extract doc IDs and get count
// (innerWindowMin is passed to intoArray, so outDocs holds absolute doc IDs)
204+
int count = windowMatches.intoArray(0, innerWindowSize, innerWindowMin, outDocs);
205+
206+
// Gather scores using extracted indices + zero used entries
// (subtracting innerWindowMin converts each doc ID back to a window-relative index)
207+
for (int i = 0; i < count; ++i) {
208+
int index = outDocs[i] - innerWindowMin;
209+
outScores[i] = windowScores[index];
210+
windowScores[index] = 0d;
211+
}
212+
213+
// Clear the bitset
214+
windowMatches.clear(0, innerWindowSize);
215+
216+
bh.consume(outScores);
217+
return count;
218+
}
219+
}

lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -263,7 +263,9 @@ private void scoreInnerWindowMultipleEssentialClauses(
263263
top = essentialQueue.updateTop();
264264
} while (top.doc < innerWindowMax);
265265

266-
docAndScoreAccBuffer.growNoCopy(windowMatches.cardinality(0, innerWindowSize));
266+
// Pre-allocate to max window size to avoid a separate cardinality() pass.
267+
// This buffer is reused across windows so the allocation is a one-time cost.
268+
docAndScoreAccBuffer.growNoCopy(INNER_WINDOW_SIZE);
267269
docAndScoreAccBuffer.size = 0;
268270
windowMatches.forEach(
269271
0,

0 commit comments

Comments
 (0)