1+ /*
2+ * Licensed to the Apache Software Foundation (ASF) under one or more
3+ * contributor license agreements. See the NOTICE file distributed with
4+ * this work for additional information regarding copyright ownership.
5+ * The ASF licenses this file to You under the Apache License, Version 2.0
6+ * (the "License"); you may not use this file except in compliance with
7+ * the License. You may obtain a copy of the License at
8+ *
9+ * http://www.apache.org/licenses/LICENSE-2.0
10+ *
11+ * Unless required by applicable law or agreed to in writing, software
12+ * distributed under the License is distributed on an "AS IS" BASIS,
13+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+ * See the License for the specific language governing permissions and
15+ * limitations under the License.
16+ */
17+ package org .apache .lucene .benchmark .jmh ;
18+
19+ import java .io .IOException ;
20+ import java .util .Arrays ;
21+ import java .util .SplittableRandom ;
22+ import java .util .concurrent .TimeUnit ;
23+ import org .apache .lucene .util .FixedBitSet ;
24+ import org .apache .lucene .util .IOIntConsumer ;
25+ import org .openjdk .jmh .annotations .Benchmark ;
26+ import org .openjdk .jmh .annotations .BenchmarkMode ;
27+ import org .openjdk .jmh .annotations .Fork ;
28+ import org .openjdk .jmh .annotations .Level ;
29+ import org .openjdk .jmh .annotations .Measurement ;
30+ import org .openjdk .jmh .annotations .Mode ;
31+ import org .openjdk .jmh .annotations .OutputTimeUnit ;
32+ import org .openjdk .jmh .annotations .Param ;
33+ import org .openjdk .jmh .annotations .Scope ;
34+ import org .openjdk .jmh .annotations .Setup ;
35+ import org .openjdk .jmh .annotations .State ;
36+ import org .openjdk .jmh .annotations .Warmup ;
37+ import org .openjdk .jmh .infra .Blackhole ;
38+
39+ /**
40+ * Benchmark comparing bitset extraction strategies used in
41+ * MaxScoreBulkScorer.scoreInnerWindowMultipleEssentialClauses().
42+ *
43+ * <p>Three strategies are compared:
44+ *
45+ * <ol>
46+ * <li><b>oldCardinalityForEach</b>: cardinality() + forEach(lambda) + clear() — 3 passes
47+ * <li><b>newForEachNoCardinality</b>: forEach(lambda) + clear() with pre-allocated buffer — 2
48+ * passes (eliminates cardinality)
49+ * <li><b>newIntoArray</b>: intoArray() + score gather loop + clear() — single extraction pass
50+ * </ol>
51+ *
52+ * <p>Both benchmarks include the populate step (setting bits + scores) to simulate the full
53+ * inner-window lifecycle.
54+ */
55+ @ BenchmarkMode (Mode .Throughput )
56+ @ OutputTimeUnit (TimeUnit .MICROSECONDS )
57+ @ State (Scope .Benchmark )
58+ @ Warmup (iterations = 5 , time = 1 )
59+ @ Measurement (iterations = 5 , time = 1 )
60+ @ Fork (
61+ value = 1 ,
62+ jvmArgsAppend = {"-Xmx1g" , "-Xms1g" , "-XX:+AlwaysPreTouch" })
63+ public class WindowExtractionBenchmark {
64+
65+ static final int INNER_WINDOW_SIZE = 1 << 12 ; // 4096, same as MaxScoreBulkScorer
66+
67+ /**
68+ * Number of matching documents in the window. Realistic values range from very sparse (10) to
69+ * moderately dense (2000). Multi-term boolean queries typically match 50-500 docs per window.
70+ */
71+ @ Param ({"10" , "50" , "128" , "500" , "1000" , "2000" })
72+ int matchCount ;
73+
74+ private final SplittableRandom random = new SplittableRandom (42 );
75+
76+ // Simulates MaxScoreBulkScorer's fields
77+ private FixedBitSet windowMatches ;
78+ private double [] windowScores ;
79+ private int innerWindowMin ;
80+
81+ // Output buffers (pre-allocated to max size, like MaxScoreBulkScorer reuses them)
82+ private int [] outDocs ;
83+ private double [] outScores ;
84+ private int outSize ;
85+
86+ // Pre-computed match positions and scores for deterministic setup
87+ private int [] matchPositions ;
88+ private double [] matchScoreValues ;
89+
90+ @ Setup (Level .Trial )
91+ public void setupTrial () {
92+ windowMatches = new FixedBitSet (INNER_WINDOW_SIZE );
93+ windowScores = new double [INNER_WINDOW_SIZE ];
94+ // +1 for denseWord2Array sentinel slot
95+ outDocs = new int [INNER_WINDOW_SIZE + 1 ];
96+ outScores = new double [INNER_WINDOW_SIZE + 1 ];
97+ outSize = 0 ;
98+ innerWindowMin = 100_000 ; // arbitrary base doc ID
99+
100+ // Pre-compute random match positions
101+ matchPositions = new int [matchCount ];
102+ matchScoreValues = new double [matchCount ];
103+ FixedBitSet temp = new FixedBitSet (INNER_WINDOW_SIZE );
104+ int count = 0 ;
105+ while (count < matchCount ) {
106+ int pos = random .nextInt (INNER_WINDOW_SIZE );
107+ if (!temp .get (pos )) {
108+ temp .set (pos );
109+ matchPositions [count ] = pos ;
110+ matchScoreValues [count ] = random .nextDouble () * 10.0 ;
111+ count ++;
112+ }
113+ }
114+ Arrays .sort (matchPositions );
115+ }
116+
117+ /** Populate the bitset and windowScores — simulates what the essential clause collection does. */
118+ private void populateWindow () {
119+ for (int i = 0 ; i < matchPositions .length ; i ++) {
120+ int pos = matchPositions [i ];
121+ windowMatches .set (pos );
122+ windowScores [pos ] = matchScoreValues [i ];
123+ }
124+ }
125+
126+ /**
127+ * ORIGINAL: cardinality() + forEach(lambda) + clear(). This is what the code did before any
128+ * optimization — 3 passes over the bitset.
129+ */
130+ @ Benchmark
131+ public int oldCardinalityForEach (Blackhole bh ) throws IOException {
132+ populateWindow ();
133+ int innerWindowSize = INNER_WINDOW_SIZE ;
134+
135+ // Pass 1: count bits to pre-size buffer
136+ int card = windowMatches .cardinality (0 , innerWindowSize );
137+ // In original code: docAndScoreAccBuffer.growNoCopy(card)
138+ // We simulate with pre-allocated buffer, but cardinality() cost is still measured
139+
140+ // Pass 2: forEach with lambda to extract docs + scores + zero scores
141+ outSize = 0 ;
142+ windowMatches .forEach (
143+ 0 ,
144+ innerWindowSize ,
145+ 0 ,
146+ (IOIntConsumer )
147+ index -> {
148+ outDocs [outSize ] = innerWindowMin + index ;
149+ outScores [outSize ] = windowScores [index ];
150+ outSize ++;
151+ windowScores [index ] = 0d ;
152+ });
153+
154+ // Pass 3: clear the bitset
155+ windowMatches .clear (0 , innerWindowSize );
156+
157+ bh .consume (card );
158+ bh .consume (outScores );
159+ return outSize ;
160+ }
161+
162+ /**
163+ * OPTIMIZED: forEach(lambda) + clear() with pre-allocated buffer. Eliminates the cardinality()
164+ * pass — 2 passes over the bitset. This is the current implementation.
165+ */
166+ @ Benchmark
167+ public int newForEachNoCardinality (Blackhole bh ) throws IOException {
168+ populateWindow ();
169+ int innerWindowSize = INNER_WINDOW_SIZE ;
170+
171+ // No cardinality pass needed — buffer pre-allocated to INNER_WINDOW_SIZE
172+
173+ // Single extraction pass: forEach with lambda
174+ outSize = 0 ;
175+ windowMatches .forEach (
176+ 0 ,
177+ innerWindowSize ,
178+ 0 ,
179+ (IOIntConsumer )
180+ index -> {
181+ outDocs [outSize ] = innerWindowMin + index ;
182+ outScores [outSize ] = windowScores [index ];
183+ outSize ++;
184+ windowScores [index ] = 0d ;
185+ });
186+
187+ // Clear the bitset
188+ windowMatches .clear (0 , innerWindowSize );
189+
190+ bh .consume (outScores );
191+ return outSize ;
192+ }
193+
194+ /**
195+ * ALTERNATIVE: intoArray() + score gather loop + clear(). Uses the optimized branchless
196+ * denseWord2Array for bit extraction — best for dense windows.
197+ */
198+ @ Benchmark
199+ public int newIntoArray (Blackhole bh ) {
200+ populateWindow ();
201+ int innerWindowSize = INNER_WINDOW_SIZE ;
202+
203+ // Single pass: extract doc IDs and get count
204+ int count = windowMatches .intoArray (0 , innerWindowSize , innerWindowMin , outDocs );
205+
206+ // Gather scores using extracted indices + zero used entries
207+ for (int i = 0 ; i < count ; ++i ) {
208+ int index = outDocs [i ] - innerWindowMin ;
209+ outScores [i ] = windowScores [index ];
210+ windowScores [index ] = 0d ;
211+ }
212+
213+ // Clear the bitset
214+ windowMatches .clear (0 , innerWindowSize );
215+
216+ bh .consume (outScores );
217+ return count ;
218+ }
219+ }
0 commit comments