Use SkipBlockRangeIterator as approximation in SortedNumericDocValuesRangeQuery (#15954)

sgup432 · web-flow · commit cb87f1f45bae · 2026-04-16T11:02:51.000+01:00
This allows conjunctions of range queries to coordinate their
approximations using the existing two-phase iteration machinery.
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -312,6 +312,8 @@ Optimizations
 
 * GITHUB#15902: Improve DiversifyingChildren performance by using primitive arrays(Alessandro Benedetti)
 
+* GITHUB#15954: Use SkipBlockRangeIterator as the two-phase approximation in SortedNumericDocValuesRangeQuery. (Sagar Upadhyaya)
+
 Bug Fixes
 ---------------------
 * GITHUB#15754: Fix HTMLStripCharFilter to prevent tags from incorrectly consuming subsequent
diff --git a/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/MultiFieldDocValuesRangeBenchmark.java b/lucene/benchmark-jmh/src/java/org/apache/lucene/benchmark/jmh/MultiFieldDocValuesRangeBenchmark.java
@@ -0,0 +1,228 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.benchmark.jmh;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Comparator;
+import java.util.Random;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Stream;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.NumericDocValuesField;
+import org.apache.lucene.document.SortedNumericDocValuesField;
+import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.MMapDirectory;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.TearDown;
+import org.openjdk.jmh.annotations.Warmup;
+
+/**
+ * Benchmarks BooleanQuery with multiple numeric range FILTER clauses.
+ *
+ * <p>Run with and without the SkipBlockRangeIterator approximation to compare. To benchmark the
+ * baseline, revert the change to SortedNumericDocValuesRangeQuery and re-run.
+ *
+ * <p>Data patterns:
+ *
+ * <ul>
+ *   <li>clustered: values increase with docID (tight skip blocks, many YES/NO). Best case.
+ *   <li>mixed: field0 monotonic, field1 low-cardinality, rest random. Realistic.
+ *   <li>sorted: field0 monotonic (index sort key), rest random. Tests pre-sorted indexes.
+ *   <li>random: all fields uniform random. Worst case.
+ * </ul>
+ */
+@State(Scope.Thread)
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Warmup(iterations = 3, time = 3)
+@Measurement(iterations = 5, time = 5)
+@Fork(value = 1, warmups = 1)
+public class MultiFieldDocValuesRangeBenchmark {
+
+  private static final String CLUSTERED = "clustered";
+  private static final String MIXED = "mixed";
+  private static final String SORTED = "sorted";
+  private static final String RANDOM = "random";
+
+  private Directory dir; // opened over `path` in setup(), closed in tearDown()
+  private IndexReader reader; // reader over the index built in setup()
+  private Path path; // temp directory holding the index; deleted in tearDown()
+  private BooleanQuery query; // one FILTER range clause per field, built in setup()
+
+  @State(Scope.Benchmark)
+  public static class Params {
+    @Param({"1000000", "10000000"})
+    public int docCount; // number of documents indexed by setup()
+
+    @Param({"3", "5"})
+    public int fieldCount; // fields per document, each with its own range filter
+
+    @Param({CLUSTERED, MIXED, RANDOM, SORTED})
+    public String dataPattern; // value distribution used by generateValue() and getQueryRange()
+  }
+
+  /** Builds the temp index (force-merged to one segment) and the conjunction of range filters. */
+  @Setup(Level.Trial)
+  public void setup(Params params) throws Exception {
+    path = Files.createTempDirectory("multiFieldBench");
+    dir = MMapDirectory.open(path);
+
+    IndexWriterConfig iwc = new IndexWriterConfig();
+    if (params.dataPattern.equals(SORTED)) {
+      iwc.setIndexSort(
+          new org.apache.lucene.search.Sort(
+              new org.apache.lucene.search.SortField(
+                  "field0", org.apache.lucene.search.SortField.Type.LONG)));
+    }
+
+    Random r = new Random(42); // fixed seed keeps the generated data reproducible
+    try (IndexWriter w = new IndexWriter(dir, iwc)) { // closed even if indexing fails
+      for (int i = 0; i < params.docCount; i++) {
+        Document doc = new Document();
+        for (int f = 0; f < params.fieldCount; f++) {
+          long value = generateValue(params.dataPattern, f, i, params.docCount, r);
+          doc.add(NumericDocValuesField.indexedField("field" + f, value));
+        }
+        w.addDocument(doc);
+      }
+      w.forceMerge(1);
+      reader = DirectoryReader.open(w);
+    }
+
+    BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
+    for (int f = 0; f < params.fieldCount; f++) {
+      long[] range = getQueryRange(params.dataPattern, f, params.docCount);
+      bqBuilder.add(
+          SortedNumericDocValuesField.newSlowRangeQuery("field" + f, range[0], range[1]),
+          Occur.FILTER);
+    }
+    query = bqBuilder.build();
+  }
+
+  /** Generates the value of field {@code fieldIdx} for document {@code docIdx}. */
+  private static long generateValue(
+      String pattern, int fieldIdx, int docIdx, int docCount, Random r) {
+    switch (pattern) {
+      case CLUSTERED:
+        // Quantize docIdx onto one of `steps` plateaus, then add noise in [0, 50).
+        long steps = (fieldIdx + 1) * 100L;
+        long plateau = docIdx * steps / docCount;
+        return plateau * docCount / steps * steps + r.nextInt(50);
+      case MIXED:
+        if (fieldIdx == 0) {
+          return (long) docIdx * 1000L + r.nextInt(100);
+        }
+        if (fieldIdx == 1) {
+          return r.nextInt(20);
+        }
+        return r.nextLong(0, docCount);
+      case SORTED:
+        // field0: increasing with docIdx (the index sort key); field1+: random, unsorted values
+        if (fieldIdx == 0) {
+          return (long) docIdx * 1000L + r.nextInt(100);
+        }
+        return r.nextLong(0, docCount);
+      case RANDOM:
+        return r.nextLong(0, docCount);
+      default:
+        throw new IllegalArgumentException("Unknown pattern: " + pattern);
+    }
+  }
+
+  /** Returns the inclusive {@code {lower, upper}} bounds queried on field {@code fieldIdx}. */
+  private static long[] getQueryRange(String pattern, int fieldIdx, int docCount) {
+    switch (pattern) {
+      case CLUSTERED:
+        // NOTE(review): generateValue() yields CLUSTERED values of roughly plateau * docCount plus
+        // noise in [0, 50), while these ranges stay within a few hundred. For fieldIdx >= 2 the
+        // range starts at 120 or more, so it appears to match no document — confirm intended.
+        long span = (fieldIdx + 1) * 100L;
+        long start = fieldIdx * span / 5;
+        return new long[] {start, start + span / 10};
+      case MIXED:
+        if (fieldIdx == 1) {
+          // generateValue() draws field1 from [0, 20).
+          return new long[] {15, 19};
+        }
+        return fieldIdx == 0 ? topTenth(docCount) : centered(docCount, docCount / 10);
+      case SORTED:
+        return fieldIdx == 0 ? topTenth(docCount) : centered(docCount, docCount / 10);
+      case RANDOM:
+        return centered(docCount, docCount / 5);
+      default:
+        throw new IllegalArgumentException("Unknown pattern: " + pattern);
+    }
+  }
+
+  /** Upper tenth of {@code [0, docCount * 1000 + 100]}, the span of the docIdx-scaled field0. */
+  private static long[] topTenth(int docCount) {
+    long top = (long) docCount * 1000L + 100;
+    return new long[] {(long) (top * 0.9), top};
+  }
+
+  /** Range of the given width centered within {@code [0, docCount]}. */
+  private static long[] centered(int docCount, long width) {
+    long low = (docCount - width) / 2;
+    return new long[] {low, low + width};
+  }
+
+  @TearDown(Level.Trial)
+  public void tearDown() throws Exception {
+    try (Directory _ = dir; // closed last; try-with-resources skips resources that are null
+        IndexReader _ = reader) { // closed first, and dir is still closed if this throws
+      dir = null;
+      reader = null;
+    }
+    if (path != null && Files.exists(path)) {
+      try (Stream<Path> walk = Files.walk(path)) {
+        walk.sorted(Comparator.reverseOrder())
+            .forEach(
+                p -> {
+                  try {
+                    Files.delete(p);
+                  } catch (IOException _) { // best-effort cleanup of the temp index
+                  }
+                });
+      }
+    }
+  }
+
+  /** Counts the documents matching every range filter, using a fresh searcher per invocation. */
+  @Benchmark
+  public int searchMultiFieldRange() throws IOException {
+    return new IndexSearcher(reader).count(query);
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java b/lucene/core/src/java/org/apache/lucene/document/SortedNumericDocValuesRangeQuery.java
@@ -28,7 +28,6 @@
 import org.apache.lucene.search.ConstantScoreScorerSupplier;
 import org.apache.lucene.search.ConstantScoreWeight;
 import org.apache.lucene.search.DocIdSetIterator;
-import org.apache.lucene.search.DocValuesRangeIterator;
 import org.apache.lucene.search.FieldExistsQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchAllDocsQuery;
@@ -40,6 +39,7 @@
 import org.apache.lucene.search.QueryVisitor;
 import org.apache.lucene.search.ScoreMode;
 import org.apache.lucene.search.ScorerSupplier;
+import org.apache.lucene.search.SkipBlockRangeIterator;
 import org.apache.lucene.search.Sort;
 import org.apache.lucene.search.TwoPhaseIterator;
 import org.apache.lucene.search.Weight;
@@ -136,16 +136,103 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
         SortedNumericDocValues values = DocValues.getSortedNumeric(context.reader(), field);
         final NumericDocValues singleton = DocValues.unwrapSingleton(values);
         final DocValuesSkipper skipper = context.reader().getDocValuesSkipper(field);
-        TwoPhaseIterator iterator;
-        if (singleton != null) {
-          if (skipper != null) {
-            final DocIdSetIterator psIterator =
-                getDocIdSetIteratorOrNullForPrimarySort(context.reader(), singleton, skipper);
-            if (psIterator != null) {
-              return ConstantScoreScorerSupplier.fromIterator(
-                  psIterator, score(), scoreMode, maxDoc);
-            }
+
+        if (singleton != null && skipper != null) {
+          final DocIdSetIterator psIterator =
+              getDocIdSetIteratorOrNullForPrimarySort(context.reader(), singleton, skipper);
+          if (psIterator != null) {
+            return ConstantScoreScorerSupplier.fromIterator(psIterator, score(), scoreMode, maxDoc);
           }
+        }
+
+        TwoPhaseIterator iterator;
+        if (skipper != null) {
+          // Use SkipBlockRangeIterator as the approximation: block-level skip
+          // filtering with no DV decoding. This exposes block skips to
+          // ConjunctionDISI so that when one field's block is NO, other fields
+          // never decode DV data for that block.
+          final SkipBlockRangeIterator skipApprox =
+              new SkipBlockRangeIterator(skipper, lowerValue, upperValue);
+          iterator =
+              new TwoPhaseIterator(skipApprox) {
+                private int cachedBlockEnd = -1;
+                private int cachedClassification = BLOCK_MAYBE;
+
+                @Override
+                public boolean matches() throws IOException {
+                  final int blockMatch = classifyBlockCached();
+                  if (blockMatch == BLOCK_YES) {
+                    // Dense block fully inside the range: no need to touch doc values.
+                    return true;
+                  }
+                  final int doc = skipApprox.docID();
+                  final boolean hasValue =
+                      singleton != null ? singleton.advanceExact(doc) : values.advanceExact(doc);
+                  if (hasValue == false) {
+                    return false;
+                  }
+                  if (blockMatch == BLOCK_IF_DOC_HAS_VALUE) {
+                    // Block bounds are inside the range, so having a value is enough.
+                    return true;
+                  }
+                  // MAYBE — need to check the actual value(s).
+                  if (singleton != null) {
+                    final long value = singleton.longValue();
+                    return lowerValue <= value && value <= upperValue;
+                  }
+                  // The first value >= lowerValue decides the match; this relies on the
+                  // values of a document being returned in ascending order.
+                  for (int left = values.docValueCount(); left > 0; --left) {
+                    final long value = values.nextValue();
+                    if (value >= lowerValue) {
+                      return value <= upperValue;
+                    }
+                  }
+                  return false;
+                }
+
+                @Override
+                public int docIDRunEnd() throws IOException {
+                  if (classifyBlockCached() != BLOCK_YES) {
+                    return super.docIDRunEnd();
+                  }
+                  // Only the current level-0 block is reported as a run of matches: the
+                  // approximation's docIDRunEnd() may expand to higher levels that could be
+                  // MAYBE, not YES. cachedBlockEnd holds skipper.maxDocID(0) for this block.
+                  return cachedBlockEnd + 1;
+                }
+
+                @Override
+                public float matchCost() {
+                  return 3; // advanceExact + 2 comparisons
+                }
+
+                private static final int BLOCK_MAYBE = 0; // must check each doc's value(s)
+                private static final int BLOCK_YES = 1; // every doc in the block matches
+                private static final int BLOCK_IF_DOC_HAS_VALUE = 2; // match iff doc has a value
+
+                // Classifies the skipper's current level-0 block against the query range. The
+                // result is cached and only recomputed when the block's last doc ID changes.
+                private int classifyBlockCached() {
+                  final int blockEnd = skipper.maxDocID(0);
+                  if (blockEnd == cachedBlockEnd) {
+                    return cachedClassification;
+                  }
+                  cachedBlockEnd = blockEnd;
+                  final boolean boundsInRange =
+                      skipper.minValue(0) >= lowerValue && skipper.maxValue(0) <= upperValue;
+                  if (boundsInRange == false) {
+                    cachedClassification = BLOCK_MAYBE;
+                  } else if (skipper.docCount(0) == blockEnd - skipper.minDocID(0) + 1) {
+                    // As many docs with a value as doc IDs in the block: every doc matches.
+                    cachedClassification = BLOCK_YES;
+                  } else {
+                    cachedClassification = BLOCK_IF_DOC_HAS_VALUE;
+                  }
+                  return cachedClassification;
+                }
+              };
+        } else if (singleton != null) {
           iterator =
               new TwoPhaseIterator(singleton) {
                 @Override
@@ -182,9 +269,6 @@ public float matchCost() {
                 }
               };
         }
-        if (skipper != null) {
-          iterator = new DocValuesRangeIterator(iterator, skipper, lowerValue, upperValue, false);
-        }
         return ConstantScoreScorerSupplier.fromIterator(
             TwoPhaseIterator.asDocIdSetIterator(iterator), score(), scoreMode, maxDoc);
       }