Skip to content

Commit cb87f1f

Browse files
authored
Use SkipBlockRangeIterator as approximation in SortedNumericDocValuesRangeQuery (#15954)
This allows conjunctions of range queries to co-ordinate their approximations using the existing two-phase iteration machinery.
1 parent 181c3ad commit cb87f1f

3 files changed

Lines changed: 327 additions & 13 deletions

File tree

lucene/CHANGES.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,8 @@ Optimizations
312312

313313
* GITHUB#15902: Improve DiversifyingChildren performance by using primitive arrays (Alessandro Benedetti)
314314

315+
* GITHUB#15954: Use SkipBlockRangeIterator as the two-phase approximation in SortedNumericDocValuesRangeQuery. (Sagar Upadhyaya)
316+
315317
Bug Fixes
316318
---------------------
317319
* GITHUB#15754: Fix HTMLStripCharFilter to prevent tags from incorrectly consuming subsequent
Lines changed: 228 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.lucene.benchmark.jmh;
18+
19+
import java.io.IOException;
20+
import java.nio.file.Files;
21+
import java.nio.file.Path;
22+
import java.util.Comparator;
23+
import java.util.Random;
24+
import java.util.concurrent.TimeUnit;
25+
import java.util.stream.Stream;
26+
import org.apache.lucene.document.Document;
27+
import org.apache.lucene.document.NumericDocValuesField;
28+
import org.apache.lucene.document.SortedNumericDocValuesField;
29+
import org.apache.lucene.index.DirectoryReader;
30+
import org.apache.lucene.index.IndexReader;
31+
import org.apache.lucene.index.IndexWriter;
32+
import org.apache.lucene.index.IndexWriterConfig;
33+
import org.apache.lucene.search.BooleanClause.Occur;
34+
import org.apache.lucene.search.BooleanQuery;
35+
import org.apache.lucene.search.IndexSearcher;
36+
import org.apache.lucene.store.Directory;
37+
import org.apache.lucene.store.MMapDirectory;
38+
import org.openjdk.jmh.annotations.Benchmark;
39+
import org.openjdk.jmh.annotations.BenchmarkMode;
40+
import org.openjdk.jmh.annotations.Fork;
41+
import org.openjdk.jmh.annotations.Level;
42+
import org.openjdk.jmh.annotations.Measurement;
43+
import org.openjdk.jmh.annotations.Mode;
44+
import org.openjdk.jmh.annotations.OutputTimeUnit;
45+
import org.openjdk.jmh.annotations.Param;
46+
import org.openjdk.jmh.annotations.Scope;
47+
import org.openjdk.jmh.annotations.Setup;
48+
import org.openjdk.jmh.annotations.State;
49+
import org.openjdk.jmh.annotations.TearDown;
50+
import org.openjdk.jmh.annotations.Warmup;
51+
52+
/**
53+
* Benchmarks BooleanQuery with multiple numeric range FILTER clauses.
54+
*
55+
* <p>Run with and without the SkipBlockRangeIterator approximation in
56+
* SortedNumericDocValuesRangeQuery to compare. To benchmark the baseline, revert that change and re-run.
57+
*
58+
* <p>Data patterns:
59+
*
60+
* <ul>
61+
* <li>clustered: values increase with docID (tight skip blocks, many YES/NO). Best case.
62+
* <li>mixed: field0 monotonic, field1 low-cardinality, rest random. Realistic.
63+
* <li>sorted: field0 monotonic (index sort key), rest random. Tests pre-sorted indexes.
64+
* <li>random: all fields uniform random. Worst case.
65+
* </ul>
66+
*/
67+
@State(Scope.Thread)
68+
@BenchmarkMode(Mode.Throughput)
69+
@OutputTimeUnit(TimeUnit.SECONDS)
70+
@Warmup(iterations = 3, time = 3)
71+
@Measurement(iterations = 5, time = 5)
72+
@Fork(value = 1, warmups = 1)
73+
public class MultiFieldDocValuesRangeBenchmark {
74+
75+
private static final String CLUSTERED = "clustered";
76+
private static final String MIXED = "mixed";
77+
private static final String SORTED = "sorted";
78+
private static final String RANDOM = "random";
79+
80+
private Directory dir;
81+
private IndexReader reader;
82+
private Path path;
83+
private BooleanQuery query;
84+
85+
/** JMH parameter holder; each combination of the values below is run as a separate trial. */
@State(Scope.Benchmark)
86+
public static class Params {
87+
// Number of documents that setup() indexes.
@Param({"1000000", "10000000"})
88+
public int docCount;
89+
90+
// Number of doc-values fields per document; setup() also adds one FILTER range clause per field.
@Param({"3", "5"})
91+
public int fieldCount;
92+
93+
// Selects both the value generator and the query ranges (see generateValue / getQueryRange).
@Param({CLUSTERED, MIXED, RANDOM, SORTED})
94+
public String dataPattern;
95+
}
96+
97+
@Setup(Level.Trial)
98+
public void setup(Params params) throws Exception {
99+
path = Files.createTempDirectory("multiFieldBench");
100+
dir = MMapDirectory.open(path);
101+
102+
IndexWriterConfig iwc = new IndexWriterConfig();
103+
if (params.dataPattern.equals(SORTED)) {
104+
iwc.setIndexSort(
105+
new org.apache.lucene.search.Sort(
106+
new org.apache.lucene.search.SortField(
107+
"field0", org.apache.lucene.search.SortField.Type.LONG)));
108+
}
109+
110+
IndexWriter w = new IndexWriter(dir, iwc);
111+
Random r = new Random(42);
112+
113+
for (int i = 0; i < params.docCount; i++) {
114+
Document doc = new Document();
115+
for (int f = 0; f < params.fieldCount; f++) {
116+
long value = generateValue(params.dataPattern, f, i, params.docCount, r);
117+
doc.add(NumericDocValuesField.indexedField("field" + f, value));
118+
}
119+
w.addDocument(doc);
120+
}
121+
w.forceMerge(1);
122+
reader = DirectoryReader.open(w);
123+
w.close();
124+
125+
BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
126+
for (int f = 0; f < params.fieldCount; f++) {
127+
long[] range = getQueryRange(params.dataPattern, f, params.docCount);
128+
bqBuilder.add(
129+
SortedNumericDocValuesField.newSlowRangeQuery("field" + f, range[0], range[1]),
130+
Occur.FILTER);
131+
}
132+
query = bqBuilder.build();
133+
}
134+
135+
private static long generateValue(
136+
String pattern, int fieldIdx, int docIdx, int docCount, Random r) {
137+
switch (pattern) {
138+
case CLUSTERED:
139+
long scale = (fieldIdx + 1) * 100L;
140+
long noise = r.nextInt(50);
141+
return (docIdx * scale / docCount) * docCount / scale * scale + noise;
142+
case MIXED:
143+
if (fieldIdx == 0) {
144+
return (long) docIdx * 1000L + r.nextInt(100);
145+
} else if (fieldIdx == 1) {
146+
return r.nextInt(20);
147+
} else {
148+
return r.nextLong(0, docCount);
149+
}
150+
case SORTED:
151+
// field0: monotonically increasing (will be the index sort key)
152+
// field1+: random values (not sorted — these benefit from coordination)
153+
if (fieldIdx == 0) {
154+
return (long) docIdx * 1000L + r.nextInt(100);
155+
} else {
156+
return r.nextLong(0, docCount);
157+
}
158+
case RANDOM:
159+
return r.nextLong(0, docCount);
160+
default:
161+
throw new IllegalArgumentException("Unknown pattern: " + pattern);
162+
}
163+
}
164+
165+
private static long[] getQueryRange(String pattern, int fieldIdx, int docCount) {
166+
switch (pattern) {
167+
case CLUSTERED:
168+
long scale = (fieldIdx + 1) * 100L;
169+
long maxVal = scale;
170+
long rangeSize = maxVal / 10;
171+
long offset = (fieldIdx * maxVal / 5);
172+
return new long[] {offset, offset + rangeSize};
173+
case MIXED:
174+
if (fieldIdx == 0) {
175+
long maxVal2 = (long) docCount * 1000L + 100;
176+
return new long[] {(long) (maxVal2 * 0.9), maxVal2};
177+
} else if (fieldIdx == 1) {
178+
return new long[] {15, 19};
179+
} else {
180+
long rangeSize2 = docCount / 10;
181+
long lower2 = (docCount - rangeSize2) / 2;
182+
return new long[] {lower2, lower2 + rangeSize2};
183+
}
184+
case SORTED:
185+
if (fieldIdx == 0) {
186+
long maxVal3 = (long) docCount * 1000L + 100;
187+
return new long[] {(long) (maxVal3 * 0.9), maxVal3};
188+
} else {
189+
long rangeSize3 = docCount / 10;
190+
long lower3 = (docCount - rangeSize3) / 2;
191+
return new long[] {lower3, lower3 + rangeSize3};
192+
}
193+
case RANDOM:
194+
long rangeSize4 = docCount / 5;
195+
long lower4 = (docCount - rangeSize4) / 2;
196+
return new long[] {lower4, lower4 + rangeSize4};
197+
default:
198+
throw new IllegalArgumentException("Unknown pattern: " + pattern);
199+
}
200+
}
201+
202+
/**
 * Releases the reader and directory and removes the temporary index directory.
 *
 * <p>Each step is guarded so that teardown still works when setup failed part-way: a reader or
 * directory that was never opened is skipped, and a failure while closing one resource does not
 * prevent the remaining cleanup from running.
 *
 * @throws Exception if closing the reader or directory, or walking the temporary directory, fails
 */
@TearDown(Level.Trial)
public void tearDown() throws Exception {
  try {
    if (reader != null) {
      reader.close();
      reader = null;
    }
  } finally {
    try {
      if (dir != null) {
        dir.close();
        dir = null;
      }
    } finally {
      if (path != null && Files.exists(path)) {
        try (Stream<Path> walk = Files.walk(path)) {
          // Reverse order visits children before their parent directory.
          walk.sorted(Comparator.reverseOrder())
              .forEach(
                  p -> {
                    try {
                      Files.delete(p);
                    } catch (IOException _) {
                      // Best-effort cleanup of a temp directory; a leftover file is harmless.
                    }
                  });
        }
      }
    }
  }
}
222+
223+
@Benchmark
224+
public int searchMultiFieldRange() throws IOException {
225+
IndexSearcher searcher = new IndexSearcher(reader);
226+
return searcher.count(query);
227+
}
228+
}

lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java

Lines changed: 97 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
import org.apache.lucene.search.ConstantScoreScorerSupplier;
2929
import org.apache.lucene.search.ConstantScoreWeight;
3030
import org.apache.lucene.search.DocIdSetIterator;
31-
import org.apache.lucene.search.DocValuesRangeIterator;
3231
import org.apache.lucene.search.FieldExistsQuery;
3332
import org.apache.lucene.search.IndexSearcher;
3433
import org.apache.lucene.search.MatchAllDocsQuery;
@@ -40,6 +39,7 @@
4039
import org.apache.lucene.search.QueryVisitor;
4140
import org.apache.lucene.search.ScoreMode;
4241
import org.apache.lucene.search.ScorerSupplier;
42+
import org.apache.lucene.search.SkipBlockRangeIterator;
4343
import org.apache.lucene.search.Sort;
4444
import org.apache.lucene.search.TwoPhaseIterator;
4545
import org.apache.lucene.search.Weight;
@@ -136,16 +136,103 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
136136
SortedNumericDocValues values = DocValues.getSortedNumeric(context.reader(), field);
137137
final NumericDocValues singleton = DocValues.unwrapSingleton(values);
138138
final DocValuesSkipper skipper = context.reader().getDocValuesSkipper(field);
139-
TwoPhaseIterator iterator;
140-
if (singleton != null) {
141-
if (skipper != null) {
142-
final DocIdSetIterator psIterator =
143-
getDocIdSetIteratorOrNullForPrimarySort(context.reader(), singleton, skipper);
144-
if (psIterator != null) {
145-
return ConstantScoreScorerSupplier.fromIterator(
146-
psIterator, score(), scoreMode, maxDoc);
147-
}
139+
140+
if (singleton != null && skipper != null) {
141+
final DocIdSetIterator psIterator =
142+
getDocIdSetIteratorOrNullForPrimarySort(context.reader(), singleton, skipper);
143+
if (psIterator != null) {
144+
return ConstantScoreScorerSupplier.fromIterator(psIterator, score(), scoreMode, maxDoc);
148145
}
146+
}
147+
148+
TwoPhaseIterator iterator;
149+
if (skipper != null) {
150+
// Use SkipBlockRangeIterator as the approximation: block-level skip
151+
// filtering with no DV decoding. This exposes block skips to
152+
// ConjunctionDISI so that when one field's block is NO, other fields
153+
// never decode DV data for that block.
154+
final SkipBlockRangeIterator skipApprox =
155+
new SkipBlockRangeIterator(skipper, lowerValue, upperValue);
156+
iterator =
157+
new TwoPhaseIterator(skipApprox) {
158+
private int cachedBlockEnd = -1;
159+
private int cachedClassification = BLOCK_MAYBE;
160+
161+
@Override
162+
public boolean matches() throws IOException {
163+
int blockMatch = classifyBlockCached();
164+
if (blockMatch == BLOCK_YES) {
165+
return true;
166+
}
167+
if (blockMatch == BLOCK_IF_DOC_HAS_VALUE) {
168+
if (singleton != null) {
169+
return singleton.advanceExact(skipApprox.docID());
170+
} else {
171+
return values.advanceExact(skipApprox.docID());
172+
}
173+
}
174+
// MAYBE — need to decode DV and check the actual value.
175+
if (singleton != null) {
176+
if (singleton.advanceExact(skipApprox.docID())) {
177+
final long value = singleton.longValue();
178+
return value >= lowerValue && value <= upperValue;
179+
}
180+
} else {
181+
if (values.advanceExact(skipApprox.docID())) {
182+
for (int i = 0, cnt = values.docValueCount(); i < cnt; ++i) {
183+
final long value = values.nextValue();
184+
if (value < lowerValue) {
185+
continue;
186+
}
187+
return value <= upperValue;
188+
}
189+
}
190+
}
191+
return false;
192+
}
193+
194+
@Override
195+
public int docIDRunEnd() throws IOException {
196+
if (classifyBlockCached() == BLOCK_YES) {
197+
// Only report the current level-0 block as a run. The
198+
// approximation's docIDRunEnd() may expand to higher levels
199+
// that could be MAYBE, not YES.
200+
return cachedBlockEnd + 1;
201+
}
202+
return super.docIDRunEnd();
203+
}
204+
205+
@Override
206+
public float matchCost() {
207+
return 3; // advanceExact + 2 comparisons
208+
}
209+
210+
private static final int BLOCK_MAYBE = 0;
211+
private static final int BLOCK_YES = 1;
212+
private static final int BLOCK_IF_DOC_HAS_VALUE = 2;
213+
214+
private int classifyBlockCached() {
215+
int blockEnd = skipper.maxDocID(0);
216+
if (blockEnd != cachedBlockEnd) {
217+
cachedBlockEnd = blockEnd;
218+
cachedClassification = classifyBlock();
219+
}
220+
return cachedClassification;
221+
}
222+
223+
private int classifyBlock() {
224+
long blockMin = skipper.minValue(0);
225+
long blockMax = skipper.maxValue(0);
226+
if (blockMin >= lowerValue && blockMax <= upperValue) {
227+
if (skipper.docCount(0) == skipper.maxDocID(0) - skipper.minDocID(0) + 1) {
228+
return BLOCK_YES;
229+
}
230+
return BLOCK_IF_DOC_HAS_VALUE;
231+
}
232+
return BLOCK_MAYBE;
233+
}
234+
};
235+
} else if (singleton != null) {
149236
iterator =
150237
new TwoPhaseIterator(singleton) {
151238
@Override
@@ -182,9 +269,6 @@ public float matchCost() {
182269
}
183270
};
184271
}
185-
if (skipper != null) {
186-
iterator = new DocValuesRangeIterator(iterator, skipper, lowerValue, upperValue, false);
187-
}
188272
return ConstantScoreScorerSupplier.fromIterator(
189273
TwoPhaseIterator.asDocIdSetIterator(iterator), score(), scoreMode, maxDoc);
190274
}

0 commit comments

Comments
 (0)