diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 9db4d195d4ba..d5ded12cb155 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -130,6 +130,11 @@ Optimizations Bug Fixes --------------------- +* GITHUB#15812: CJKBigramFilter now produces consistent token positions regardless of the + outputUnigrams setting. Previously, when outputUnigrams=false, bigrams did not account for + spanning two character positions, causing position mismatches after word breaks that broke + phrase queries in combined unigram+bigram indexing strategies. (Sarah Syarofina) + * GITHUB#14049: Randomize KNN codec params in RandomCodec. Fixes scalar quantization div-by-zero when all values are identical. (Mike Sokolov) diff --git a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java index ca69ecd9f893..269d1d9b692e 100644 --- a/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java +++ b/lucene/analysis/common/src/java/org/apache/lucene/analysis/cjk/CJKBigramFilter.java @@ -91,6 +91,12 @@ public final class CJKBigramFilter extends TokenFilter { private final boolean outputUnigrams; private boolean ngramState; // false = output unigram, true = output bigram + // When outputUnigrams=false, a bigram spans two character positions but only advances the + // position counter by 1. After a CJK segment boundary (word break or non-CJK token), the next + // token must account for the "missing" trailing unigram position. These fields track this. + private boolean hadBigrams; // true if bigrams were emitted from the current CJK segment + private int deferredPosInc; // extra position increment to apply to the next emitted token + private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); @@ -191,6 +197,10 @@ public boolean incrementToken() throws IOException { // otherwise, we clear the buffer and start over. if (offsetAtt.startOffset() != lastEndOffset) { // unaligned, clear queue + if (hadBigrams) { + deferredPosInc++; + hadBigrams = false; + } if (hasBufferedUnigram()) { // we have a buffered unigram, and we peeked ahead to see if we could form @@ -209,6 +219,11 @@ public boolean incrementToken() throws IOException { // not a CJK type: we just return these as-is. + if (hadBigrams) { + deferredPosInc++; + hadBigrams = false; + } + if (hasBufferedUnigram()) { // we have a buffered unigram, and we peeked ahead to see if we could form @@ -219,6 +234,10 @@ public boolean incrementToken() throws IOException { flushUnigram(); return true; } + if (deferredPosInc > 0) { + posIncAtt.setPositionIncrement(posIncAtt.getPositionIncrement() + deferredPosInc); + deferredPosInc = 0; + } return true; } } else { @@ -317,6 +336,13 @@ private void flushBigram() { if (outputUnigrams) { posIncAtt.setPositionIncrement(0); posLengthAtt.setPositionLength(2); + } else { + // Apply any deferred position increment from a previous CJK segment boundary. + if (deferredPosInc > 0) { + posIncAtt.setPositionIncrement(1 + deferredPosInc); + deferredPosInc = 0; + } + hadBigrams = true; } index++; } @@ -333,6 +359,10 @@ private void flushUnigram() { termAtt.setLength(len); offsetAtt.setOffset(startOffset[index], endOffset[index]); typeAtt.setType(SINGLE_TYPE); + if (deferredPosInc > 0) { + posIncAtt.setPositionIncrement(1 + deferredPosInc); + deferredPosInc = 0; + } index++; } @@ -364,5 +394,7 @@ public void reset() throws IOException { loneState = null; exhausted = false; ngramState = false; + hadBigrams = false; + deferredPosInc = 0; } } diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java index 60de5c516b47..d0bd73b51d2e 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKAnalyzer.java @@ -86,7 +86,7 @@ public void testJa2() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 2, 1, 1, 1, 2}); } public void testC() throws IOException { @@ -147,7 +147,7 @@ public void testFinalOffset() throws IOException { new int[] {0, 2}, new int[] {2, 6}, new String[] {"", ""}, - new int[] {1, 1}); + new int[] {1, 2}); assertAnalyzesTo( analyzer, @@ -177,7 +177,7 @@ public void testMix() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1}); } public void testMix2() throws IOException { @@ -200,7 +200,7 @@ public void testMix2() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2}); } /** Non-english text (outside of CJK) is treated normally, according to unicode rules */ @@ -256,7 +256,7 @@ public void testReusableTokenStream() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1}); assertAnalyzesTo( analyzer, @@ -277,7 +277,7 @@ public void testReusableTokenStream() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2}); } public void testSingleChar() throws IOException { diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java index 4e6499ed96fc..b5a04ae2ddb5 100644 --- a/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java +++ b/lucene/analysis/common/src/test/org/apache/lucene/analysis/cjk/TestCJKBigramFilter.java @@ -110,7 +110,7 @@ protected TokenStreamComponents createComponents(String fieldName) { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + new int[] {1, 1, 1, 1, 2, 1, 2, 1, 1, 1}, new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); a.close(); } @@ -217,6 +217,79 @@ public void testUnigramsAndBigramsHuge() throws Exception { }); } + /** + * Test that positions are consistent between outputUnigrams=true and outputUnigrams=false when a + * CJK segment is followed by a word break and then another token. See GITHUB#15812. + */ + public void testBigramPositionsConsistentAcrossWordBreak() throws Exception { + // "一二、三": bigram "一二" followed by punctuation break, then lone "三" + // With outputUnigrams=true: 一(pos0) 一二(pos0) 二(pos1) 三(pos2) + // With outputUnigrams=false: 一二(pos0) 三(pos2) — NOT pos1 + assertAnalyzesTo( + analyzer, + "一二、三", + new String[] {"一二", "三"}, + new int[] {0, 3}, + new int[] {2, 4}, + new String[] {"", ""}, + new int[] {1, 2}, + new int[] {1, 1}); + + // Also verify the outputUnigrams=true case for comparison + assertAnalyzesTo( + unibiAnalyzer, + "一二、三", + new String[] {"一", "一二", "二", "三"}, + new int[] {0, 0, 1, 3}, + new int[] {1, 2, 2, 4}, + new String[] {"", "", "", ""}, + new int[] {1, 0, 1, 1}, + new int[] {1, 2, 1, 1}); + } + + /** + * Test positions across multiple CJK segments separated by word breaks. Each segment's bigrams + * should account for spanning two positions, so subsequent tokens align correctly. + */ + public void testBigramPositionsMultipleSegments() throws Exception { + // "一二、三四、五" → segments [一二], [三四], [五] + assertAnalyzesTo( + analyzer, + "一二、三四、五", + new String[] {"一二", "三四", "五"}, + new int[] {0, 3, 6}, + new int[] {2, 5, 7}, + new String[] {"", "", ""}, + new int[] {1, 2, 2}, + new int[] {1, 1, 1}); + } + + /** + * Test positions when a CJK bigram segment is followed by non-CJK text. The non-CJK token should + * account for the bigram spanning two positions. + */ + public void testBigramPositionsBeforeNonCJK() throws Exception { + Analyzer a = + new Analyzer() { + @Override + protected TokenStreamComponents createComponents(String fieldName) { + Tokenizer t = new StandardTokenizer(); + return new TokenStreamComponents(t, new CJKBigramFilter(t, CJKBigramFilter.HAN)); + } + }; + // "学生abc" → HAN bigram "学生" then non-CJK "abc" + assertAnalyzesTo( + a, + "学生abc", + new String[] {"学生", "abc"}, + new int[] {0, 2}, + new int[] {2, 5}, + new String[] {"", ""}, + new int[] {1, 2}, + new int[] {1, 1}); + a.close(); + } + /** blast some random strings through the analyzer */ public void testRandomUnibiStrings() throws Exception { checkRandomData(random(), unibiAnalyzer, 200 * RANDOM_MULTIPLIER); diff --git a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java index 8fe60a45fea8..ab88dec8f2e3 100644 --- a/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java +++ b/lucene/analysis/icu/src/test/org/apache/lucene/analysis/icu/segmentation/TestWithCJKBigramFilter.java @@ -118,7 +118,7 @@ public void testJa2() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 2, 1, 1, 1, 2}); } public void testC() throws IOException { @@ -179,7 +179,7 @@ public void testFinalOffset() throws IOException { new int[] {0, 2}, new int[] {2, 6}, new String[] {"", ""}, - new int[] {1, 1}); + new int[] {1, 2}); assertAnalyzesTo( analyzer, @@ -209,7 +209,7 @@ public void testMix() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1}); } public void testMix2() throws IOException { @@ -232,7 +232,7 @@ public void testMix2() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2}); } /** Non-english text (outside of CJK) is treated normally, according to unicode rules */ @@ -288,7 +288,7 @@ public void testReusableTokenStream() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1}); assertAnalyzesTo( analyzer, @@ -309,7 +309,7 @@ public void testReusableTokenStream() throws IOException { "", "" }, - new int[] {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}); + new int[] {1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2}); } public void testSingleChar() throws IOException {