Skip to content

Commit 2aa8442

Browse files
authored
Merge branch 'apache:main' into optimize-maxscore-eliminate-cardinality
2 parents 3186e2f + 34c9495 commit 2aa8442

34 files changed

Lines changed: 681 additions & 296 deletions

File tree

lucene/CHANGES.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,12 @@ API Changes
8282
To force a classic DFA execution, use Operations.determinize() and AutomatonQuery.
8383
(Dimitris Rempapis)
8484

85+
* GITHUB#15961: Removed WildcardQuery constructor overloads that exposed determinization controls
86+
(determinizeWorkLimit). Determinization is now done only as-needed.
87+
To force a classic DFA execution, use Operations.determinize() and AutomatonQuery.
88+
Also removed QueryParserBase.setDeterminizeWorkLimit/getDeterminizeWorkLimit, which are
89+
no longer needed. (Dimitris Rempapis)
90+
8591
New Features
8692
---------------------
8793
* GITHUB#15505: Upgrade snowball to 2d2e312df56f2ede014a4ffb3e91e6dea43c24be. New stemmer: PolishStemmer (and
@@ -298,6 +304,8 @@ Improvements
298304

299305
Optimizations
300306
---------------------
307+
* GITHUB#15861: Optimize PhraseScorer by short-circuiting non-competitive documents in TOP_SCORES mode. (Prithvi S)
308+
301309
* GITHUB#15637: Lazily allocate ByteArrayDataInputs in SegmentTermsEnumFrame#15637 (Misha Dmitriev)
302310

303311
* GITHUB#15718 Skip per-document stored field reads on sorted indices when no stored fields are present (Francisco Fernández Castaño)
@@ -389,6 +397,8 @@ Other
389397

390398
* GITHUB#15951: Fix WindowsFS onClose race condition (Szymon Bialkowski)
391399

400+
* GITHUB#15960: Move parent field from DWPT to IndexingChain. (Tim Brooks)
401+
392402
======================= Lucene 10.4.0 =======================
393403

394404
API Changes

lucene/MIGRATE.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -159,9 +159,9 @@ iwc.getConfig().getCodec().compoundFormat().getShouldUseCompoundFile();
159159
iwc.getConfig().getCodec().compoundFormat().getMaxCFSSegmentSizeMB();
160160
```
161161

162-
### Implicit determinization removed from RegexpQuery
162+
### Implicit determinization removed from RegexpQuery and WildcardQuery
163163

164-
Previously, RegexpQuery would use DFA execution by default, even if it might be inefficient.
164+
Previously, RegexpQuery and WildcardQuery would use DFA execution by default, even if it might be inefficient.
165165

166166
RegexpQuery will now only [determinize as-needed](https://swtch.com/~rsc/regexp/regexp1.html). This might be
167167
faster or slower depending upon your queries.
@@ -174,6 +174,18 @@ Automaton dfa = Operations.determinize(new RegExp(re).toAutomaton(), 10000);
174174
Query query = new AutomatonQuery(new Term("myfield", re), dfa);
175175
```
176176

177+
Similarly for WildcardQuery, the `determinizeWorkLimit` parameter has been removed from `WildcardQuery` constructors and from
178+
`WildcardQuery.toAutomaton`. `QueryParserBase.setDeterminizeWorkLimit` and `getDeterminizeWorkLimit`
179+
have also been removed.
180+
181+
To force the previous behavior, use:
182+
183+
```java
184+
String pattern = "foo*bar";
185+
Automaton dfa = Operations.determinize(WildcardQuery.toAutomaton(new Term("myfield", pattern)), 10000);
186+
Query query = new AutomatonQuery(new Term("myfield", pattern), dfa);
187+
```
188+
177189
### CollectionStatistics and TermStatistics have been renamed to FieldStats and TermStats (GITHUB#15929)
178190

179191
Corresponding methods and parameters have been renamed accordingly.
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.lucene.benchmark.jmh;
19+
20+
import java.io.IOException;
21+
import java.util.concurrent.TimeUnit;
22+
import org.apache.lucene.document.Document;
23+
import org.apache.lucene.document.Field;
24+
import org.apache.lucene.document.TextField;
25+
import org.apache.lucene.index.DirectoryReader;
26+
import org.apache.lucene.index.IndexReader;
27+
import org.apache.lucene.index.IndexWriter;
28+
import org.apache.lucene.index.IndexWriterConfig;
29+
import org.apache.lucene.search.IndexSearcher;
30+
import org.apache.lucene.search.PhraseQuery;
31+
import org.apache.lucene.search.Sort;
32+
import org.apache.lucene.search.TopDocs;
33+
import org.apache.lucene.search.TopFieldCollectorManager;
34+
import org.apache.lucene.search.TopScoreDocCollectorManager;
35+
import org.apache.lucene.store.Directory;
36+
import org.apache.lucene.store.MMapDirectory;
37+
import org.openjdk.jmh.annotations.Benchmark;
38+
import org.openjdk.jmh.annotations.BenchmarkMode;
39+
import org.openjdk.jmh.annotations.Fork;
40+
import org.openjdk.jmh.annotations.Level;
41+
import org.openjdk.jmh.annotations.Measurement;
42+
import org.openjdk.jmh.annotations.Mode;
43+
import org.openjdk.jmh.annotations.OutputTimeUnit;
44+
import org.openjdk.jmh.annotations.Scope;
45+
import org.openjdk.jmh.annotations.Setup;
46+
import org.openjdk.jmh.annotations.State;
47+
import org.openjdk.jmh.annotations.TearDown;
48+
import org.openjdk.jmh.annotations.Warmup;
49+
50+
@BenchmarkMode(Mode.Throughput)
51+
@OutputTimeUnit(TimeUnit.MILLISECONDS)
52+
@State(Scope.Benchmark)
53+
@Warmup(iterations = 5, time = 3)
54+
@Measurement(iterations = 10, time = 5)
55+
@Fork(
56+
value = 3,
57+
jvmArgsAppend = {"-Xmx2g", "-Xms2g"})
58+
public class PhraseScorerBenchmark {
59+
60+
private static final int NUM_HITS = 10;
61+
62+
private Directory dir;
63+
private IndexReader reader;
64+
private IndexSearcher searcher;
65+
private PhraseQuery exactQuery;
66+
private PhraseQuery sloppyQuery;
67+
68+
@Setup(Level.Trial)
69+
public void setUp() throws IOException {
70+
dir = new MMapDirectory(java.nio.file.Files.createTempDirectory("benchmark"));
71+
IndexWriterConfig config = new IndexWriterConfig();
72+
try (IndexWriter writer = new IndexWriter(dir, config)) {
73+
// Create a corpus where most docs contain the individual query terms but only a small
74+
// fraction contain the actual phrase. This maximises the number of documents whose maxFreq
75+
// upper-bound check allows short-circuiting.
76+
for (int i = 0; i < 1_000_000; i++) {
77+
Document doc = new Document();
78+
if (i % 1000 == 0) {
79+
// 0.1% of docs: exact phrase match
80+
doc.add(
81+
new TextField(
82+
"text", "the quick brown fox jumped over the lazy dog", Field.Store.NO));
83+
} else if (i % 2 == 0) {
84+
// 50% of docs: terms present but not as a phrase (high freq, no match)
85+
StringBuilder sb = new StringBuilder("quick ");
86+
for (int j = 0; j < 100; j++) sb.append("padding ");
87+
sb.append("fox");
88+
doc.add(new TextField("text", sb.toString(), Field.Store.NO));
89+
} else {
90+
// 50% of docs: no query terms at all
91+
doc.add(new TextField("text", "unrelated words", Field.Store.NO));
92+
}
93+
writer.addDocument(doc);
94+
}
95+
}
96+
reader = DirectoryReader.open(dir);
97+
searcher = new IndexSearcher(reader);
98+
exactQuery = new PhraseQuery("text", "quick", "brown", "fox");
99+
sloppyQuery = new PhraseQuery(10, "text", "quick", "fox");
100+
}
101+
102+
@TearDown(Level.Trial)
103+
public void tearDown() throws IOException {
104+
reader.close();
105+
dir.close();
106+
}
107+
108+
@Benchmark
109+
public TopDocs benchmarkExactTopScores() throws IOException {
110+
return searcher.search(exactQuery, 10);
111+
}
112+
113+
@Benchmark
114+
public TopDocs benchmarkSloppyTopScores() throws IOException {
115+
return searcher.search(sloppyQuery, 10);
116+
}
117+
118+
@Benchmark
119+
public TopDocs benchmarkExactComplete() throws IOException {
120+
return searcher.search(
121+
exactQuery, new TopScoreDocCollectorManager(NUM_HITS, Integer.MAX_VALUE));
122+
}
123+
124+
@Benchmark
125+
public TopDocs benchmarkExactCompleteNoScores() throws IOException {
126+
return searcher.search(
127+
exactQuery, new TopFieldCollectorManager(Sort.INDEXORDER, NUM_HITS, Integer.MAX_VALUE));
128+
}
129+
130+
@Benchmark
131+
public TopDocs benchmarkSloppyComplete() throws IOException {
132+
return searcher.search(
133+
sloppyQuery, new TopScoreDocCollectorManager(NUM_HITS, Integer.MAX_VALUE));
134+
}
135+
136+
@Benchmark
137+
public TopDocs benchmarkSloppyCompleteNoScores() throws IOException {
138+
return searcher.search(
139+
sloppyQuery, new TopFieldCollectorManager(Sort.INDEXORDER, NUM_HITS, Integer.MAX_VALUE));
140+
}
141+
}

lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiQueryMaker.java

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
import org.apache.lucene.search.MultiTermQuery;
3232
import org.apache.lucene.search.Query;
3333
import org.apache.lucene.search.WildcardQuery;
34-
import org.apache.lucene.util.automaton.Operations;
3534

3635
/**
3736
* A QueryMaker that uses common and uncommon actual Wikipedia queries for searching the English
@@ -132,10 +131,7 @@ public class EnwikiQueryMaker extends AbstractQueryMaker {
132131

133132
private static Query[] getPrebuiltQueries(String field) {
134133
WildcardQuery wcq =
135-
new WildcardQuery(
136-
new Term(field, "fo*"),
137-
Operations.DEFAULT_DETERMINIZE_WORK_LIMIT,
138-
MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE);
134+
new WildcardQuery(new Term(field, "fo*"), MultiTermQuery.CONSTANT_SCORE_BLENDED_REWRITE);
139135
// be wary of unanalyzed text
140136
return new Query[] {
141137
new SpanFirstQuery(new SpanTermQuery(new Term(field, "ford")), 5),

lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThread.java

Lines changed: 7 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@
2424
import java.util.Iterator;
2525
import java.util.List;
2626
import java.util.Locale;
27-
import java.util.NoSuchElementException;
2827
import java.util.Objects;
2928
import java.util.Set;
3029
import java.util.concurrent.TimeUnit;
@@ -33,7 +32,6 @@
3332
import java.util.concurrent.locks.Lock;
3433
import java.util.concurrent.locks.ReentrantLock;
3534
import org.apache.lucene.codecs.Codec;
36-
import org.apache.lucene.document.NumericDocValuesField;
3735
import org.apache.lucene.index.DocumentsWriterDeleteQueue.DeleteSlice;
3836
import org.apache.lucene.search.DocIdSetIterator;
3937
import org.apache.lucene.store.Directory;
@@ -140,7 +138,7 @@ void abort() throws IOException {
140138
private int[] deleteDocIDs = new int[0];
141139
private int numDeletedDocIds = 0;
142140
private final int indexMajorVersionCreated;
143-
private final IndexingChain.ReservedField<NumericDocValuesField> parentField;
141+
private final boolean hasParentField;
144142

145143
DocumentsWriterPerThread(
146144
int indexMajorVersionCreated,
@@ -197,13 +195,7 @@ void abort() throws IOException {
197195
fieldInfos,
198196
indexWriterConfig,
199197
this::onAbortingException);
200-
if (indexWriterConfig.getParentField() != null) {
201-
this.parentField =
202-
indexingChain.markAsReserved(
203-
new NumericDocValuesField(indexWriterConfig.getParentField(), -1));
204-
} else {
205-
this.parentField = null;
206-
}
198+
this.hasParentField = indexWriterConfig.getParentField() != null;
207199
}
208200

209201
final void testPoint(String message) {
@@ -249,12 +241,10 @@ long updateDocuments(
249241
final Iterator<? extends Iterable<? extends IndexableField>> iterator = docs.iterator();
250242
while (iterator.hasNext()) {
251243
Iterable<? extends IndexableField> doc = iterator.next();
252-
if (parentField != null) {
253-
if (iterator.hasNext() == false) {
254-
doc = addParentField(doc, parentField);
255-
}
256-
} else if (segmentInfo.getIndexSort() != null
257-
&& iterator.hasNext()
244+
final boolean isLastDoc = iterator.hasNext() == false;
245+
if (hasParentField == false
246+
&& segmentInfo.getIndexSort() != null
247+
&& isLastDoc == false
258248
&& indexMajorVersionCreated >= Version.LUCENE_10_0_0.major) {
259249
// sort is configured but parent field is missing, yet we have a doc-block
260250
// yet we must not fail if this index was created in an earlier version where this
@@ -271,7 +261,7 @@ long updateDocuments(
271261
// vs non-aborting exceptions):
272262
reserveOneDoc();
273263
try {
274-
indexingChain.processDocument(numDocsInRAM++, doc);
264+
indexingChain.processDocument(numDocsInRAM++, doc, isLastDoc);
275265
} finally {
276266
onNewDocOnRAM.run();
277267
}
@@ -294,34 +284,6 @@ long updateDocuments(
294284
}
295285
}
296286

297-
private Iterable<? extends IndexableField> addParentField(
298-
Iterable<? extends IndexableField> doc, IndexableField parentField) {
299-
return () -> {
300-
final Iterator<? extends IndexableField> first = doc.iterator();
301-
return new Iterator<>() {
302-
IndexableField additionalField = parentField;
303-
304-
@Override
305-
public boolean hasNext() {
306-
return additionalField != null || first.hasNext();
307-
}
308-
309-
@Override
310-
public IndexableField next() {
311-
if (additionalField != null) {
312-
IndexableField field = additionalField;
313-
additionalField = null;
314-
return field;
315-
}
316-
if (first.hasNext()) {
317-
return first.next();
318-
}
319-
throw new NoSuchElementException();
320-
}
321-
};
322-
};
323-
}
324-
325287
private long finishDocuments(DocumentsWriterDeleteQueue.Node<?> deleteNode, int docIdUpTo) {
326288
/*
327289
* here we actually finish the document in two steps 1. push the delete into

0 commit comments

Comments
 (0)