Skip to content

Commit fa0e704

Browse files
authored
Rename CollectionStatistics to FieldStats and TermStatistics to TermStats. (#15929)
1 parent ac19f0a commit fa0e704

50 files changed

Lines changed: 322 additions & 378 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

lucene/CHANGES.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ http://s.apache.org/luceneversions
77

88
API Changes
99
---------------------
10+
* GITHUB#15929: Rename CollectionStatistics to FieldStats and TermStatistics to TermStats. (Zhou Hui)
1011

1112
* GITHUB#15763: Deprecate Operations.complement() method. This operation can be slow and is not
1213
recommended for production use. It will be removed in Lucene 12. (Saurabh Singh)

lucene/MIGRATE.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,10 @@ Automaton dfa = Operations.determinize(new RegExp(re).toAutomaton(), 10000);
174174
Query query = new AutomatonQuery(new Term("myfield", re), dfa);
175175
```
176176

177+
### CollectionStatistics and TermStatistics have been renamed to FieldStats and TermStats (GITHUB#15929)
178+
179+
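Related methods and parameters have been renamed to match; for example, `IndexSearcher#collectionStatistics` is now `IndexSearcher#fieldStats` and `IndexSearcher#termStatistics` is now `IndexSearcher#termStats`.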
180+
177181
## Migration from Lucene 10.4 to Lucene 10.5
178182

179183
### `[Byte|Float]VectorSimilarityQuery` now performs adaptive HNSW graph traversal

lucene/core/src/java/org/apache/lucene/search/CombinedFieldQuery.java

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,8 @@
5252
*
5353
* <ol>
5454
* <li>Given a list of fields and weights, it pretends there is a synthetic combined field where
55-
* all terms have been indexed. It computes new term and collection statistics for this
56-
* combined field.
55+
* all terms have been indexed. It computes new term and field statistics for this combined
56+
* field.
5757
* <li>It uses a disjunction iterator and {@link IndexSearcher#getSimilarity} to score documents.
5858
* </ol>
5959
*
@@ -279,41 +279,37 @@ class CombinedFieldWeight extends Weight {
279279
TermStates ts = TermStates.build(searcher, fieldTerms[i], true);
280280
termStates[i] = ts;
281281
if (ts.docFreq() > 0) {
282-
TermStatistics termStats =
283-
searcher.termStatistics(fieldTerms[i], ts.docFreq(), ts.totalTermFreq());
282+
TermStats termStats = searcher.termStats(fieldTerms[i], ts.docFreq(), ts.totalTermFreq());
284283
docFreq = Math.max(termStats.docFreq(), docFreq);
285284
totalTermFreq += (double) field.weight * termStats.totalTermFreq();
286285
}
287286
}
288287
if (docFreq > 0) {
289-
CollectionStatistics pseudoCollectionStats = mergeCollectionStatistics(searcher);
290-
TermStatistics pseudoTermStatistics =
291-
new TermStatistics(new BytesRef("pseudo_term"), docFreq, Math.max(1, totalTermFreq));
292-
this.simWeight =
293-
searcher.getSimilarity().scorer(boost, pseudoCollectionStats, pseudoTermStatistics);
288+
FieldStats pseudoFieldStats = mergeFieldStats(searcher);
289+
TermStats pseudoTermStats =
290+
new TermStats(new BytesRef("pseudo_term"), docFreq, Math.max(1, totalTermFreq));
291+
this.simWeight = searcher.getSimilarity().scorer(boost, pseudoFieldStats, pseudoTermStats);
294292
} else {
295293
this.simWeight = null;
296294
}
297295
}
298296

299-
private CollectionStatistics mergeCollectionStatistics(IndexSearcher searcher)
300-
throws IOException {
297+
private FieldStats mergeFieldStats(IndexSearcher searcher) throws IOException {
301298
long maxDoc = 0;
302299
long docCount = 0;
303300
long sumTotalTermFreq = 0;
304301
long sumDocFreq = 0;
305302
for (FieldAndWeight fieldWeight : fieldAndWeights.values()) {
306-
CollectionStatistics collectionStats = searcher.collectionStatistics(fieldWeight.field);
307-
if (collectionStats != null) {
308-
maxDoc = Math.max(collectionStats.maxDoc(), maxDoc);
309-
docCount = Math.max(collectionStats.docCount(), docCount);
310-
sumDocFreq = Math.max(collectionStats.sumDocFreq(), sumDocFreq);
311-
sumTotalTermFreq += (double) fieldWeight.weight * collectionStats.sumTotalTermFreq();
303+
FieldStats fieldStats = searcher.fieldStats(fieldWeight.field);
304+
if (fieldStats != null) {
305+
maxDoc = Math.max(fieldStats.maxDoc(), maxDoc);
306+
docCount = Math.max(fieldStats.docCount(), docCount);
307+
sumDocFreq = Math.max(fieldStats.sumDocFreq(), sumDocFreq);
308+
sumTotalTermFreq += (double) fieldWeight.weight * fieldStats.sumTotalTermFreq();
312309
}
313310
}
314311

315-
return new CollectionStatistics(
316-
"pseudo_field", maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
312+
return new FieldStats("pseudo_field", maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
317313
}
318314

319315
@Override

lucene/core/src/java/org/apache/lucene/search/CollectionStatistics.java renamed to lucene/core/src/java/org/apache/lucene/search/FieldStats.java

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import java.util.Objects;
2020

2121
/**
22-
* Contains statistics for a collection (field).
22+
* Contains statistics for a field.
2323
*
2424
* <p>This class holds statistics across all documents for scoring purposes:
2525
*
@@ -54,30 +54,30 @@
5454
* Terms#getDocCount()
5555
* @param sumTotalTermFreq The total number of tokens for this field, in the range [{@link
5656
* #sumDocFreq()} .. {@link Long#MAX_VALUE}]. This is the "word count" for this field across all
57-
* documents. It is the sum of {@link TermStatistics#totalTermFreq()} across all terms. It is
58-
* also the sum of each document's field length across all documents.
57+
* documents. It is the sum of {@link TermStats#totalTermFreq()} across all terms. It is also
58+
* the sum of each document's field length across all documents.
5959
* <p>This value is always a positive number, and always at least {@link #sumDocFreq()}. @see
6060
* Terms#getSumTotalTermFreq()
6161
* @param sumDocFreq The total number of posting list entries for this field, in the range [{@link
6262
* #docCount()} .. {@link #sumTotalTermFreq()}]. This is the sum of term-document pairs: the sum
63-
* of {@link TermStatistics#docFreq()} across all terms. It is also the sum of each document's
64-
* unique term count for this field across all documents.
63+
* of {@link TermStats#docFreq()} across all terms. It is also the sum of each document's unique
64+
* term count for this field across all documents.
6565
* <p>This value is always a positive number, always at least {@link #docCount()}, and never
6666
* exceeds {@link #sumTotalTermFreq()}. @see Terms#getSumDocFreq()
6767
* @lucene.experimental
6868
*/
69-
public record CollectionStatistics(
69+
public record FieldStats(
7070
String field, long maxDoc, long docCount, long sumTotalTermFreq, long sumDocFreq) {
7171
/**
72-
* Creates statistics instance for a collection (field).
72+
* Creates statistics instance for a field.
7373
*
7474
* @throws IllegalArgumentException if {@code maxDoc} is negative or zero.
7575
* @throws IllegalArgumentException if {@code docCount} is negative or zero.
7676
* @throws IllegalArgumentException if {@code docCount} is more than {@code maxDoc}.
7777
* @throws IllegalArgumentException if {@code sumDocFreq} is less than {@code docCount}.
7878
* @throws IllegalArgumentException if {@code sumTotalTermFreq} is less than {@code sumDocFreq}.
7979
*/
80-
public CollectionStatistics {
80+
public FieldStats {
8181
Objects.requireNonNull(field);
8282
if (maxDoc <= 0) {
8383
throw new IllegalArgumentException("maxDoc must be positive, maxDoc: " + maxDoc);

lucene/core/src/java/org/apache/lucene/search/IndexSearcher.java

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1106,32 +1106,31 @@ public String toString() {
11061106
}
11071107

11081108
/**
1109-
* Returns {@link TermStatistics} for a term.
1109+
* Returns {@link TermStats} for a term.
11101110
*
11111111
* <p>This can be overridden, for example, to return a term's statistics across a distributed
11121112
* collection.
11131113
*
11141114
* @param docFreq The document frequency of the term. It must be greater or equal to 1.
11151115
* @param totalTermFreq The total term frequency.
1116-
* @return A {@link TermStatistics} (never null).
1116+
* @return A {@link TermStats} (never null).
11171117
* @lucene.experimental
11181118
*/
1119-
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq)
1120-
throws IOException {
1119+
public TermStats termStats(Term term, int docFreq, long totalTermFreq) throws IOException {
11211120
// This constructor will throw an exception if docFreq <= 0.
1122-
return new TermStatistics(term.bytes(), docFreq, totalTermFreq);
1121+
return new TermStats(term.bytes(), docFreq, totalTermFreq);
11231122
}
11241123

11251124
/**
1126-
* Returns {@link CollectionStatistics} for a field, or {@code null} if the field does not exist
1127-
* (has no indexed terms)
1125+
* Returns {@link FieldStats} for a field, or {@code null} if the field does not exist (has no
1126+
indexed terms).
11281127
*
11291128
* <p>This can be overridden, for example, to return a field's statistics across a distributed
11301129
* collection.
11311130
*
11321131
* @lucene.experimental
11331132
*/
1134-
public CollectionStatistics collectionStatistics(String field) throws IOException {
1133+
public FieldStats fieldStats(String field) throws IOException {
11351134
assert field != null;
11361135
long docCount = 0;
11371136
long sumTotalTermFreq = 0;
@@ -1145,7 +1144,7 @@ public CollectionStatistics collectionStatistics(String field) throws IOExceptio
11451144
if (docCount == 0) {
11461145
return null;
11471146
}
1148-
return new CollectionStatistics(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
1147+
return new FieldStats(field, reader.maxDoc(), docCount, sumTotalTermFreq, sumDocFreq);
11491148
}
11501149

11511150
/**

lucene/core/src/java/org/apache/lucene/search/MultiPhraseQuery.java

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -216,7 +216,7 @@ public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float bo
216216
protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOException {
217217

218218
// compute idf
219-
ArrayList<TermStatistics> allTermStats = new ArrayList<>();
219+
ArrayList<TermStats> allTermStats = new ArrayList<>();
220220
for (final Term[] terms : termArrays) {
221221
for (Term term : terms) {
222222
TermStates ts = termStates.get(term);
@@ -225,17 +225,15 @@ protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOExcepti
225225
termStates.put(term, ts);
226226
}
227227
if (scoreMode.needsScores() && ts.docFreq() > 0) {
228-
allTermStats.add(searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq()));
228+
allTermStats.add(searcher.termStats(term, ts.docFreq(), ts.totalTermFreq()));
229229
}
230230
}
231231
}
232232
if (allTermStats.isEmpty()) {
233233
return null; // none of the terms were found, we won't use sim at all
234234
} else {
235235
return similarity.scorer(
236-
boost,
237-
searcher.collectionStatistics(field),
238-
allTermStats.toArray(TermStatistics[]::new));
236+
boost, searcher.fieldStats(field), allTermStats.toArray(TermStats[]::new));
239237
}
240238
}
241239

lucene/core/src/java/org/apache/lucene/search/PhraseQuery.java

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -469,24 +469,21 @@ protected Similarity.SimScorer getStats(IndexSearcher searcher) throws IOExcepti
469469
"PhraseWeight requires that the first position is 0, call rewrite first");
470470
}
471471
states = new TermStates[terms.length];
472-
TermStatistics[] termStats = new TermStatistics[terms.length];
472+
TermStats[] termStats = new TermStats[terms.length];
473473
int termUpTo = 0;
474474
for (int i = 0; i < terms.length; i++) {
475475
final Term term = terms[i];
476476
states[i] = TermStates.build(searcher, term, scoreMode.needsScores());
477477
if (scoreMode.needsScores()) {
478478
TermStates ts = states[i];
479479
if (ts.docFreq() > 0) {
480-
termStats[termUpTo++] =
481-
searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
480+
termStats[termUpTo++] = searcher.termStats(term, ts.docFreq(), ts.totalTermFreq());
482481
}
483482
}
484483
}
485484
if (termUpTo > 0) {
486485
return similarity.scorer(
487-
boost,
488-
searcher.collectionStatistics(field),
489-
ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
486+
boost, searcher.fieldStats(field), ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
490487
} else {
491488
return null; // no terms at all, we won't use similarity
492489
}

lucene/core/src/java/org/apache/lucene/search/SynonymQuery.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ class SynonymWeight extends Weight {
207207
super(query);
208208
assert scoreMode.needsScores();
209209
this.scoreMode = scoreMode;
210-
CollectionStatistics collectionStats = searcher.collectionStatistics(field);
210+
FieldStats fieldStats = searcher.fieldStats(field);
211211
long docFreq = 0;
212212
long totalTermFreq = 0;
213213
termStates = new TermStates[terms.length];
@@ -216,17 +216,16 @@ class SynonymWeight extends Weight {
216216
TermStates ts = TermStates.build(searcher, term, true);
217217
termStates[i] = ts;
218218
if (ts.docFreq() > 0) {
219-
TermStatistics termStats =
220-
searcher.termStatistics(term, ts.docFreq(), ts.totalTermFreq());
219+
TermStats termStats = searcher.termStats(term, ts.docFreq(), ts.totalTermFreq());
221220
docFreq = Math.max(termStats.docFreq(), docFreq);
222221
totalTermFreq += termStats.totalTermFreq();
223222
}
224223
}
225224
this.similarity = searcher.getSimilarity();
226225
if (docFreq > 0) {
227-
TermStatistics pseudoStats =
228-
new TermStatistics(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq);
229-
this.simWeight = similarity.scorer(boost, collectionStats, pseudoStats);
226+
TermStats pseudoStats =
227+
new TermStats(new BytesRef("synonym pseudo-term"), docFreq, totalTermFreq);
228+
this.simWeight = similarity.scorer(boost, fieldStats, pseudoStats);
230229
} else {
231230
this.simWeight = null; // no terms exist at all, we won't use similarity
232231
}

lucene/core/src/java/org/apache/lucene/search/TermQuery.java

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,18 +58,18 @@ public TermWeight(
5858
this.termStates = termStates;
5959
this.similarity = searcher.getSimilarity();
6060

61-
final CollectionStatistics collectionStats;
62-
final TermStatistics termStats;
61+
final FieldStats fieldStats;
62+
final TermStats termStats;
6363
if (scoreMode.needsScores()) {
64-
collectionStats = searcher.collectionStatistics(term.field());
64+
fieldStats = searcher.fieldStats(term.field());
6565
termStats =
6666
termStates.docFreq() > 0
67-
? searcher.termStatistics(term, termStates.docFreq(), termStates.totalTermFreq())
67+
? searcher.termStats(term, termStates.docFreq(), termStates.totalTermFreq())
6868
: null;
6969
} else {
7070
// we do not need the actual stats, use fake stats with docFreq=maxDoc=ttf=1
71-
collectionStats = new CollectionStatistics(term.field(), 1, 1, 1, 1);
72-
termStats = new TermStatistics(term.bytes(), 1, 1);
71+
fieldStats = new FieldStats(term.field(), 1, 1, 1, 1);
72+
termStats = new TermStats(term.bytes(), 1, 1);
7373
}
7474

7575
if (termStats == null) {
@@ -79,7 +79,7 @@ public TermWeight(
7979
// allocations in case default BM25Scorer is used.
8080
// See: https://github.com/apache/lucene/issues/12297
8181
if (scoreMode.needsScores()) {
82-
this.simScorer = similarity.scorer(boost, collectionStats, termStats);
82+
this.simScorer = similarity.scorer(boost, fieldStats, termStats);
8383
} else {
8484
// Assigning a dummy scorer as this is not expected to be called since scores are not
8585
// needed.

lucene/core/src/java/org/apache/lucene/search/TermScorer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ public int docID() {
8383
}
8484

8585
/** Returns term frequency in the current document. */
86-
public final int freq() throws IOException {
86+
public int freq() throws IOException {
8787
return postingsEnum.freq();
8888
}
8989

0 commit comments

Comments
 (0)