Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,8 @@ Bug Fixes

* GITHUB#15656: Fix wrong BitSet result in MemoryAccountingBitsetCollector (Binlong Gao)

* GITHUB#15656: Fix SpanOrQuery scoring each document using the combined IDF of all clauses rather than only the IDF of the matched clauses. (Binlong Gao)

Other
---------------------
* GITHUB#15586: Document that scoring and ranking may change across major Lucene versions, and that applications requiring stable ranking should explicitly configure Similarity. (Parveen Saini)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
package org.apache.lucene.queries.payloads;

import java.io.IOException;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import org.apache.lucene.index.LeafReaderContext;
Expand All @@ -26,6 +27,7 @@
import org.apache.lucene.index.TermStates;
import org.apache.lucene.queries.spans.FilterSpans;
import org.apache.lucene.queries.spans.SpanCollector;
import org.apache.lucene.queries.spans.SpanOrQuery;
import org.apache.lucene.queries.spans.SpanQuery;
import org.apache.lucene.queries.spans.SpanScorer;
import org.apache.lucene.queries.spans.SpanWeight;
Expand Down Expand Up @@ -193,6 +195,33 @@ public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOExcepti
}
NumericDocValues norms = context.reader().getNormValues(field);
PayloadSpans payloadSpans = new PayloadSpans(spans, decoder);
// When the inner weight is SpanOrWeight, use per-clause scorers so that
// getSpanScore() is consistent with innerWeight.explain()
if (innerWeight instanceof SpanOrQuery.SpanOrWeight spanOrWeight) {
List<SpanScorer> perClause = spanOrWeight.createPerClauseScorers(context);
if (perClause == null) {
return null;
}
final var scorer =
new PayloadSpanScorer(payloadSpans, innerWeight.getSimScorer(), norms) {
@Override
protected float getSpanScore() throws IOException {
// Advance each per-clause sub-scorer to the current doc and sum scores.
int doc = docID();
float sum = 0;
for (SpanScorer sub : perClause) {
if (sub.docID() < doc) {
sub.iterator().advance(doc);
}
if (sub.docID() == doc) {
sum += sub.score();
}
}
return sum;
}
};
return new DefaultScorerSupplier(scorer);
}
final var scorer = new PayloadSpanScorer(payloadSpans, innerWeight.getSimScorer(), norms);
return new DefaultScorerSupplier(scorer);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,16 @@
import java.util.List;
import java.util.Map;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermStates;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.Explanation;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryVisitor;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.ScorerSupplier;
import org.apache.lucene.search.TwoPhaseIterator;
import org.apache.lucene.search.Weight;
import org.apache.lucene.util.PriorityQueue;
Expand Down Expand Up @@ -165,6 +168,62 @@ public void extractTermStates(Map<Term, TermStates> contexts) {
}
}

@Override
public ScorerSupplier scorerSupplier(LeafReaderContext context) throws IOException {
  // Build one scorer per clause so each document is scored with the IDF of the
  // clauses that actually matched it, not a combined IDF across all clauses.
  List<SpanScorer> subScorers = createPerClauseScorers(context);
  if (subScorers == null) {
    // No clause has spans in this segment: nothing can match.
    return null;
  }

  Spans mergedSpans = getSpans(context, Postings.POSITIONS);
  if (mergedSpans == null) {
    // getSpans may return null for a segment (see the per-clause null check in
    // createPerClauseScorers); bail out rather than hand null to the scorer.
    return null;
  }
  NumericDocValues norms = context.reader().getNormValues(field);
  // DefaultScorerSupplier behaves exactly like a hand-rolled supplier whose
  // get(leadCost) returns the scorer and whose cost() delegates to its iterator,
  // and keeps this method consistent with PayloadScoreQuery.
  return new Weight.DefaultScorerSupplier(new SpanOrScorer(subScorers, mergedSpans, norms));
}

/**
 * Builds one {@link SpanScorer} per clause that has spans in the given segment, each backed by
 * that clause's own simScorer.
 *
 * @param context the segment to create scorers for
 * @return the per-clause scorers, or null when no clause has spans in this segment
 */
public List<SpanScorer> createPerClauseScorers(LeafReaderContext context) throws IOException {
  final List<SpanScorer> perClause = new ArrayList<>();
  for (SpanWeight clauseWeight : subWeights) {
    final Spans clauseSpans = clauseWeight.getSpans(context, Postings.POSITIONS);
    if (clauseSpans == null) {
      continue;
    }
    // Fetch norms separately for every scorer: a doc-values instance advances forward only,
    // so the sub-scorers must not share a single iterator.
    final NumericDocValues clauseNorms = context.reader().getNormValues(field);
    perClause.add(new SpanScorer(clauseSpans, clauseWeight.getSimScorer(), clauseNorms));
  }
  return perClause.isEmpty() ? null : perClause;
}

/**
 * Explains the score of {@code doc} as the sum of the explanations of every matching clause,
 * mirroring the per-clause scoring used at search time.
 */
@Override
public Explanation explain(LeafReaderContext context, int doc) throws IOException {
  final List<Explanation> matchingClauses = new ArrayList<>();
  float total = 0;
  for (SpanWeight clauseWeight : subWeights) {
    final Explanation clauseExplanation = clauseWeight.explain(context, doc);
    if (clauseExplanation.isMatch() == false) {
      continue;
    }
    matchingClauses.add(clauseExplanation);
    total += clauseExplanation.getValue().floatValue();
  }
  return matchingClauses.isEmpty()
      ? Explanation.noMatch("no matching term")
      : Explanation.match(total, "sum of:", matchingClauses);
}

@Override
public Spans getSpans(final LeafReaderContext context, Postings requiredPostings)
throws IOException {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.queries.spans;

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.NumericDocValues;

/**
 * Scorer for {@link SpanOrQuery} that sums scores from each matching sub-clause independently, so
 * each clause uses only its own terms' IDF rather than a combined IDF across all clauses.
 */
final class SpanOrScorer extends SpanScorer {

  private final List<SpanScorer> subScorers;

  SpanOrScorer(List<SpanScorer> subScorers, Spans mergedSpans, NumericDocValues norms)
      throws IOException {
    // The simScorer is deliberately null: scoring is delegated entirely to the
    // per-clause sub-scorers in scoreCurrentDoc().
    super(mergedSpans, null, norms);
    this.subScorers = subScorers;
  }

  @Override
  protected float scoreCurrentDoc() throws IOException {
    final int current = docID();
    float total = 0;
    for (SpanScorer clauseScorer : subScorers) {
      // Catch lagging sub-scorers up to the document being scored.
      if (clauseScorer.docID() < current) {
        clauseScorer.iterator().advance(current);
      }
      // advance() may overshoot when the clause has no match on this doc; only
      // clauses positioned exactly here contribute to the score.
      final boolean positionedHere = clauseScorer.docID() == current;
      if (positionedHere && clauseScorer.scorer != null) {
        total += clauseScorer.score();
      }
    }
    return total;
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
package org.apache.lucene.queries.spans;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
Expand Down Expand Up @@ -136,6 +138,62 @@ public void setUp() throws Exception {
}
}

/**
 * Verify that SpanOrQuery scores each document using only the IDF of the term that matched, not
 * the combined IDF of all clauses. With two docs (foo:bar, foo:baz) and a spanOr([foo:bar,
 * foo:baz]) query, each doc should score the same as the equivalent bool-should query (i.e.
 * single-term IDF, not summed IDF).
 */
public void testSpanOrScoresWithPerClauseIdf() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
  FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
  // One doc per term: doc0 contains only "bar", doc1 contains only "baz".
  for (String value : new String[] {"bar", "baz"}) {
    Document doc = new Document();
    doc.add(newField("foo", value, fieldType));
    writer.addDocument(doc);
  }
  IndexReader reader = writer.getReader();
  writer.close();

  IndexSearcher searcher = newSearcher(reader);
  searcher.setSimilarity(new BM25Similarity());

  SpanTermQuery barQuery = new SpanTermQuery(new Term("foo", "bar"));
  SpanTermQuery bazQuery = new SpanTermQuery(new Term("foo", "baz"));

  // Single-term searches yield both the expected per-clause score and the matching doc id.
  var barHit = searcher.search(barQuery, 10).scoreDocs[0];
  var bazHit = searcher.search(bazQuery, 10).scoreDocs[0];

  // SpanOrQuery should score each doc the same as its individual SpanTermQuery.
  TopDocs orResults = searcher.search(new SpanOrQuery(barQuery, bazQuery), 10);
  assertEquals(2, orResults.totalHits.value());

  Map<Integer, Float> scoreByDoc = new HashMap<>();
  for (var hit : orResults.scoreDocs) {
    scoreByDoc.put(hit.doc, hit.score);
  }

  assertEquals(
      "foo:bar doc score should equal single SpanTermQuery score, not combined-IDF score",
      barHit.score,
      scoreByDoc.get(barHit.doc),
      1e-5f);
  assertEquals(
      "foo:baz doc score should equal single SpanTermQuery score, not combined-IDF score",
      bazHit.score,
      scoreByDoc.get(bazHit.doc),
      1e-5f);

  reader.close();
  dir.close();
}

/** make sure all sims work with spanOR(termX, termY) where termY does not exist */
public void testCrazySpans() throws Exception {
// historically this was a problem, but sim's no longer have to score terms that dont exist
Expand Down
Loading