diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 08366c78888d..f350abb1e60b 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -265,6 +265,8 @@ Bug Fixes * GITHUB#15656: Fix wrong BitSet result in MemoryAccountingBitsetCollector (Binlong Gao) +* GITHUB#15656: Fix SpanOrQuery scores each document using the combined IDF of all clauses rather than the matched clauses(Binlong Gao) + Other --------------------- * GITHUB#15586: Document that scoring and ranking may change across major Lucene versions, and that applications requiring stable ranking should explicitly configure Similarity. (Parveen Saini) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java index 59e35aa7604d..acac02ff8276 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/payloads/PayloadScoreQuery.java @@ -17,6 +17,7 @@ package org.apache.lucene.queries.payloads; import java.io.IOException; +import java.util.List; import java.util.Map; import java.util.Objects; import org.apache.lucene.index.LeafReaderContext; @@ -26,6 +27,7 @@ import org.apache.lucene.index.TermStates; import org.apache.lucene.queries.spans.FilterSpans; import org.apache.lucene.queries.spans.SpanCollector; +import org.apache.lucene.queries.spans.SpanOrQuery; import org.apache.lucene.queries.spans.SpanQuery; import org.apache.lucene.queries.spans.SpanScorer; import org.apache.lucene.queries.spans.SpanWeight; @@ -193,6 +195,33 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti } NumericDocValues norms = context.reader().getNormValues(field); PayloadSpans payloadSpans = new PayloadSpans(spans, decoder); + // When the inner weight is SpanOrWeight, use per-clause scorers so that + // getSpanScore() is consistent with innerWeight.explain() + if (innerWeight instanceof SpanOrQuery.SpanOrWeight spanOrWeight) { + List perClause = spanOrWeight.createPerClauseScorers(context); + if (perClause == null) { + return null; + } + final var scorer = + new PayloadSpanScorer(payloadSpans, innerWeight.getSimScorer(), norms) { + @Override + protected float getSpanScore() throws IOException { + // Advance each per-clause sub-scorer to the current doc and sum scores. + int doc = docID(); + float sum = 0; + for (SpanScorer sub : perClause) { + if (sub.docID() < doc) { + sub.iterator().advance(doc); + } + if (sub.docID() == doc) { + sum += sub.score(); + } + } + return sum; + } + }; + return new DefaultScorerSupplier(scorer); + } final var scorer = new PayloadSpanScorer(payloadSpans, innerWeight.getSimScorer(), norms); return new DefaultScorerSupplier(scorer); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanOrQuery.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanOrQuery.java index 0acdd61f71e8..7de61be522af 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanOrQuery.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanOrQuery.java @@ -23,13 +23,16 @@ import java.util.List; import java.util.Map; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.NumericDocValues; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.Explanation; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.ScorerSupplier; import org.apache.lucene.search.TwoPhaseIterator; import org.apache.lucene.search.Weight; import org.apache.lucene.util.PriorityQueue; @@ -165,6 +168,62 @@ public void extractTermStates(Map contexts) { } } + @Override + public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException { + List subScorers = createPerClauseScorers(context); + if (subScorers == null) { + return null; + } + + Spans mergedSpans = getSpans(context, Postings.POSITIONS); + NumericDocValues norms = context.reader().getNormValues(field); + final var scorer = new SpanOrScorer(subScorers, mergedSpans, norms); + return new ScorerSupplier() { + @Override + public SpanOrScorer get(long leadCost) { + return scorer; + } + + @Override + public long cost() { + return scorer.iterator().cost(); + } + }; + } + + /** + * Creates per-clause SpanScorers, each using its own simScorer. Returns null if no clauses have + * spans in this segment. + */ + public List createPerClauseScorers(LeafReaderContext context) throws IOException { + List subScorers = new ArrayList<>(); + for (SpanWeight w : subWeights) { + Spans spans = w.getSpans(context, Postings.POSITIONS); + if (spans != null) { + NumericDocValues norms = context.reader().getNormValues(field); + subScorers.add(new SpanScorer(spans, w.getSimScorer(), norms)); + } + } + return subScorers.isEmpty() ? null : subScorers; + } + + @Override + public Explanation explain(LeafReaderContext context, int doc) throws IOException { + List subExplanations = new ArrayList<>(); + float sum = 0; + for (SpanWeight w : subWeights) { + Explanation e = w.explain(context, doc); + if (e.isMatch()) { + subExplanations.add(e); + sum += e.getValue().floatValue(); + } + } + if (subExplanations.isEmpty()) { + return Explanation.noMatch("no matching term"); + } + return Explanation.match(sum, "sum of:", subExplanations); + } + @Override public Spans getSpans(final LeafReaderContext context, Postings requiredPostings) throws IOException { diff --git a/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanOrScorer.java b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanOrScorer.java new file mode 100644 index 000000000000..aca6bc16464f --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/spans/SpanOrScorer.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.queries.spans; + +import java.io.IOException; +import java.util.List; +import org.apache.lucene.index.NumericDocValues; + +/** + * Scorer for {@link SpanOrQuery} that sums scores from each matching sub-clause independently, so + * each clause uses only its own terms' IDF rather than a combined IDF across all clauses. + */ +final class SpanOrScorer extends SpanScorer { + + private final List subScorers; + + SpanOrScorer(List subScorers, Spans mergedSpans, NumericDocValues norms) + throws IOException { + // Pass null simScorer: scoreCurrentDoc() is overridden to sum sub-scorer scores. + super(mergedSpans, null, norms); + this.subScorers = subScorers; + } + + @Override + protected float scoreCurrentDoc() throws IOException { + int doc = docID(); + float sum = 0; + for (SpanScorer sub : subScorers) { + if (sub.docID() < doc) { + sub.iterator().advance(doc); + } + if (sub.docID() == doc && sub.scorer != null) { + sum += sub.score(); + } + } + return sum; + } +} diff --git a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java index 7b4b76e04fc7..177d7d4e7485 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/spans/TestSpanSimilarity.java @@ -18,7 +18,9 @@ package org.apache.lucene.queries.spans; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; +import java.util.Map; import org.apache.lucene.document.Document; import org.apache.lucene.document.FieldType; import org.apache.lucene.document.TextField; @@ -136,6 +138,62 @@ public void setUp() throws Exception { } } + /** + * Verify that SpanOrQuery scores each document using only the IDF of the term that matched, not + * the combined IDF of all clauses. With two docs (foo:bar, foo:baz) and a spanOr([foo:bar, + * foo:baz]) query, each doc should score the same as the equivalent bool-should query (i.e. + * single-term IDF, not summed IDF). + */ + public void testSpanOrScoresWithPerClauseIdf() throws Exception { + Directory dir = newDirectory(); + RandomIndexWriter iw = new RandomIndexWriter(random(), dir); + FieldType ft = new FieldType(TextField.TYPE_NOT_STORED); + Document doc1 = new Document(); + doc1.add(newField("foo", "bar", ft)); + iw.addDocument(doc1); + Document doc2 = new Document(); + doc2.add(newField("foo", "baz", ft)); + iw.addDocument(doc2); + IndexReader ir = iw.getReader(); + iw.close(); + + IndexSearcher is = newSearcher(ir); + is.setSimilarity(new BM25Similarity()); + + SpanTermQuery spanBar = new SpanTermQuery(new Term("foo", "bar")); + SpanTermQuery spanBaz = new SpanTermQuery(new Term("foo", "baz")); + + // Score each term individually to get the expected single-term score + float scoreBar = is.search(spanBar, 10).scoreDocs[0].score; + float scoreBaz = is.search(spanBaz, 10).scoreDocs[0].score; + + // SpanOrQuery should score each doc the same as its individual SpanTermQuery + TopDocs spanOrResults = is.search(new SpanOrQuery(spanBar, spanBaz), 10); + assertEquals(2, spanOrResults.totalHits.value()); + + Map spanOrScores = new HashMap<>(); + for (var sd : spanOrResults.scoreDocs) { + spanOrScores.put(sd.doc, sd.score); + } + + int docBar = is.search(spanBar, 10).scoreDocs[0].doc; + int docBaz = is.search(spanBaz, 10).scoreDocs[0].doc; + + assertEquals( + "foo:bar doc score should equal single SpanTermQuery score, not combined-IDF score", + scoreBar, + spanOrScores.get(docBar), + 1e-5f); + assertEquals( + "foo:baz doc score should equal single SpanTermQuery score, not combined-IDF score", + scoreBaz, + spanOrScores.get(docBaz), + 1e-5f); + + ir.close(); + dir.close(); + } + /** make sure all sims work with spanOR(termX, termY) where termY does not exist */ public void testCrazySpans() throws Exception { // historically this was a problem, but sim's no longer have to score terms that dont exist