From 50773607ec31eab977e39755d2343d5097a9903a Mon Sep 17 00:00:00 2001 From: Xuanwo Date: Thu, 25 Jun 2026 16:03:56 +0800 Subject: [PATCH] feat: add ICU split tokenizer variant --- docs/src/format/index/scalar/fts.md | 5 +- docs/src/guide/tokenizer.md | 6 ++ docs/src/quickstart/full-text-search.md | 2 +- .../index/scalar/InvertedIndexParams.java | 2 + .../index/scalar/InvertedIndexParamsTest.java | 31 ++++++ python/python/lance/dataset.py | 2 + python/python/tests/test_scalar_index.py | 49 ++++++++++ .../src/scalar/inverted/tokenizer.rs | 35 ++++++- rust/lance-tokenizer/src/icu.rs | 97 +++++++++++++++++-- 9 files changed, 213 insertions(+), 16 deletions(-) create mode 100644 java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java diff --git a/docs/src/format/index/scalar/fts.md b/docs/src/format/index/scalar/fts.md index 792702aca08..adc7f94d65e 100644 --- a/docs/src/format/index/scalar/fts.md +++ b/docs/src/format/index/scalar/fts.md @@ -81,6 +81,7 @@ The full text search index supports multiple tokenizer types for different text | **raw** | No tokenization, treats entire text as single token | Exact matching | | **ngram** | Breaks text into overlapping character sequences | Substring/fuzzy search | | **icu** | ICU dictionary-based Unicode word segmentation | Mixed-language text | +| **icu/split** | ICU segmentation with simple-style delimiter splitting | Mixed-language identifiers | | **jieba/*** | Chinese text tokenizer with word segmentation | Chinese text | | **lindera/*** | Japanese text tokenizer with morphological analysis | Japanese text | @@ -88,8 +89,10 @@ The full text search index supports multiple tokenizer types for different text The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token. +By default, Lance preserves ICU word segments as returned by ICU. Use `base_tokenizer: "icu/split"` to split ICU word segments again on non-alphanumeric delimiters such as underscores and punctuation. For example, `hello_world こんにちは世界` is tokenized as `hello`, `world`, `こんにちは`, and `世界`. + - **Models**: Uses compiled ICU4X segmenter data bundled with Lance -- **Usage**: Specify as `icu` +- **Usage**: Specify as `icu`, or `icu/split` to split punctuation-delimited identifiers - **Features**: - Unicode-aware word boundary detection - Dictionary-based segmentation for Chinese, Japanese, Khmer, Lao, Myanmar, and Thai diff --git a/docs/src/guide/tokenizer.md b/docs/src/guide/tokenizer.md index 192574656f5..eedac585fe9 100644 --- a/docs/src/guide/tokenizer.md +++ b/docs/src/guide/tokenizer.md @@ -20,6 +20,12 @@ ICU uses Unicode word boundary rules and bundled dictionary data for complex scr ds.create_scalar_index("text", "INVERTED", base_tokenizer="icu") ``` +Use `icu/split` when mixed-language text also contains punctuation-delimited identifiers that should be searchable by part. + +```python +ds.create_scalar_index("text", "INVERTED", base_tokenizer="icu/split") +``` + ## Language Models of Jieba ### Downloading the Model diff --git a/docs/src/quickstart/full-text-search.md b/docs/src/quickstart/full-text-search.md index 17327e40bc5..f990b2bd589 100644 --- a/docs/src/quickstart/full-text-search.md +++ b/docs/src/quickstart/full-text-search.md @@ -90,7 +90,7 @@ ds.create_scalar_index( index_type="INVERTED", name="text_idx", # Optional index name (if omitted, default is "text_idx") with_position=False, # Set True to enable phrase queries (stores token positions) - base_tokenizer="simple", # Tokenizer: "simple" (whitespace+punct), "icu", "whitespace", or "raw" (no tokenization) + base_tokenizer="simple", # Tokenizer: "simple", "icu", "icu/split", "whitespace", "raw", or "ngram" language="English", # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True) max_token_length=40, # Drop tokens longer than this length lower_case=True, # Lowercase text before tokenization diff --git a/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java b/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java index ca0a7a46c70..f509efd922b 100755 --- a/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java +++ b/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java @@ -65,6 +65,8 @@ public static final class Builder { *
  • {@code "whitespace"}: splits tokens on whitespace *
  • {@code "raw"}: no tokenization *
  • {@code "ngram"}: N-Gram tokenizer + *
  • {@code "icu"}: ICU dictionary-based Unicode word segmentation + *
  • {@code "icu/split"}: ICU segmentation with simple-style delimiter splitting *
  • {@code "lindera/*"}: Lindera tokenizer *
  • {@code "jieba/*"}: Jieba tokenizer * diff --git a/java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java b/java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java new file mode 100644 index 00000000000..e5024a95c2a --- /dev/null +++ b/java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.lance.index.scalar; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +public class InvertedIndexParamsTest { + + @Test + public void testIcuSplitTokenizerVariant() { + ScalarIndexParams params = InvertedIndexParams.builder().baseTokenizer("icu/split").build(); + + assertEquals("inverted", params.getIndexType()); + String jsonParams = params.getJsonParams().orElseThrow(AssertionError::new); + assertTrue(jsonParams.contains("\"base_tokenizer\":\"icu/split\"")); + } +} diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py index 57a8238132c..fda06eff260 100644 --- a/python/python/lance/dataset.py +++ b/python/python/lance/dataset.py @@ -3326,6 +3326,8 @@ def create_scalar_index( * "simple": splits tokens on whitespace and punctuation. * "whitespace": splits tokens on whitespace. * "raw": no tokenization. + * "icu": ICU dictionary-based Unicode word segmentation. + * "icu/split": ICU segmentation with simple-style delimiter splitting. language: str, default "English" This is for the ``INVERTED`` index. The language for stemming and stop words. This is only used when `stem` or `remove_stop_words` is true diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py index b6e882633f5..4450f6821cc 100644 --- a/python/python/tests/test_scalar_index.py +++ b/python/python/tests/test_scalar_index.py @@ -1869,6 +1869,55 @@ def test_icu_tokenizer(tmp_path): assert results["_rowid"].to_pylist() == [0] +def test_icu_tokenizer_split_on_non_alphanumeric_default(tmp_path): + data = pa.table({"text": ["hello_world"]}) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu", + stem=False, + remove_stop_words=False, + ) + + results = ds.to_table(full_text_query="hello", prefilter=True, with_row_id=True) + assert results.num_rows == 0 + + results = ds.to_table( + full_text_query="hello_world", prefilter=True, with_row_id=True + ) + assert results["_rowid"].to_pylist() == [0] + + +def test_icu_tokenizer_split_on_non_alphanumeric(tmp_path): + data = pa.table( + { + "text": [ + "hello_world こんにちは世界", + "alpha.beta", + ], + } + ) + ds = lance.write_dataset(data, tmp_path, mode="overwrite") + ds.create_scalar_index( + "text", + "INVERTED", + base_tokenizer="icu/split", + stem=False, + remove_stop_words=False, + ) + + for query, expected_row_ids in [ + ("hello", [0]), + ("world", [0]), + ("世界", [0]), + ("alpha", [1]), + ("beta", [1]), + ]: + results = ds.to_table(full_text_query=query, prefilter=True, with_row_id=True) + assert results["_rowid"].to_pylist() == expected_row_ids + + def test_jieba_invalid_user_dict_tokenizer(tmp_path): set_language_model_path() data = pa.table( diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs index 5a2a701dc73..1785a23fedd 100644 --- a/rust/lance-index/src/scalar/inverted/tokenizer.rs +++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs @@ -42,6 +42,7 @@ pub struct InvertedIndexParams { /// - `whitespace`: splits tokens on whitespace /// - `raw`: no tokenization /// - `icu`: ICU dictionary-based word segmentation + /// - `icu/split`: ICU segmentation with simple-style delimiter splitting /// - `lindera/*`: Lindera tokenizer /// - `jieba/*`: Jieba tokenizer /// @@ -197,6 +198,7 @@ impl InvertedIndexParams { /// - `raw`: no tokenization /// - `ngram`: N-Gram tokenizer /// - `icu`: ICU dictionary-based word segmentation + /// - `icu/split`: ICU segmentation with simple-style delimiter splitting /// - `lindera/*`: Lindera tokenizer /// - `jieba/*`: Jieba tokenizer /// @@ -376,7 +378,9 @@ impl InvertedIndexParams { fn stop_word_filter(&self) -> Result { match &self.custom_stop_words { Some(words) => Ok(StopWordFilter::remove(words.iter().cloned())), - None if self.base_tokenizer == "icu" => Ok(StopWordFilter::all()), + None if self.base_tokenizer == "icu" || self.base_tokenizer == "icu/split" => { + Ok(StopWordFilter::all()) + } None => StopWordFilter::new(self.language).ok_or_else(|| { Error::invalid_input(format!( "removing stop words for language {:?} is not supported yet", @@ -392,6 +396,9 @@ impl InvertedIndexParams { "whitespace" => Ok(TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()), "raw" => Ok(TextAnalyzer::builder(RawTokenizer::default()).dynamic()), "icu" => Ok(TextAnalyzer::builder(IcuTokenizer::default()).dynamic()), + "icu/split" => { + Ok(TextAnalyzer::builder(IcuTokenizer::default().with_simple_split()).dynamic()) + } "ngram" => { let tokenizer = NgramTokenizer::new( self.min_ngram_length as usize, @@ -445,6 +452,7 @@ pub fn language_model_home() -> Option { mod tests { use super::InvertedIndexParams; use lance_tokenizer::TokenStream; + use rstest::rstest; #[test] fn test_build_only_fields_are_not_serialized() { @@ -508,6 +516,23 @@ mod tests { assert_eq!(tokens, vec!["hello", "こんにちは", "世界"]); } + #[test] + fn test_build_icu_tokenizer_with_split_on_non_alphanumeric() { + let mut tokenizer = InvertedIndexParams::default() + .base_tokenizer("icu/split".to_string()) + .stem(false) + .remove_stop_words(false) + .build() + .unwrap(); + let mut stream = tokenizer.token_stream_for_doc("hello_world こんにちは世界 alpha.beta"); + let mut tokens = Vec::new(); + stream.process(&mut |token| tokens.push(token.text.clone())); + assert_eq!( + tokens, + vec!["hello", "world", "こんにちは", "世界", "alpha", "beta"] + ); + } + #[test] fn test_remove_stop_words_respects_language_for_non_icu_tokenizer() { let mut tokenizer = InvertedIndexParams::default() @@ -541,11 +566,13 @@ mod tests { assert_eq!(tokens, vec!["the".to_string(), "data".to_string()]); } - #[test] - fn test_icu_stop_words_use_all_builtin_lists() { + #[rstest] + #[case::icu("icu")] + #[case::icu_split("icu/split")] + fn test_icu_stop_words_use_all_builtin_lists(#[case] base_tokenizer: &str) { let mut tokenizer = InvertedIndexParams::default() .stem(false) - .base_tokenizer("icu".to_string()) + .base_tokenizer(base_tokenizer.to_string()) .build() .unwrap(); let mut stream = tokenizer.token_stream_for_search("the 的 lance data"); diff --git a/rust/lance-tokenizer/src/icu.rs b/rust/lance-tokenizer/src/icu.rs index 4e36f23115f..eb9cd874c76 100644 --- a/rust/lance-tokenizer/src/icu.rs +++ b/rust/lance-tokenizer/src/icu.rs @@ -8,17 +8,25 @@ use crate::{TextAnalyzer, TextAnalyzerBuilder, Token, TokenStream, Tokenizer}; #[derive(Clone)] pub struct IcuTokenizer { segmenter: WordSegmenterBorrowed<'static>, + split_on_non_alphanumeric: bool, } impl Default for IcuTokenizer { fn default() -> Self { Self { segmenter: WordSegmenter::new_dictionary(WordBreakInvariantOptions::default()), + split_on_non_alphanumeric: false, } } } impl IcuTokenizer { + /// Split ICU word segments again on simple-tokenizer delimiters. + pub fn with_simple_split(mut self) -> Self { + self.split_on_non_alphanumeric = true; + self + } + pub fn analyzer(self) -> TextAnalyzer { TextAnalyzer::builder(self).build() } @@ -33,6 +41,40 @@ pub struct IcuTokenStream { index: usize, } +fn push_token(tokens: &mut Vec, text: &str, offset_from: usize, offset_to: usize) { + if offset_from == offset_to { + return; + } + + let token_text = &text[offset_from..offset_to]; + if token_text.chars().any(char::is_alphanumeric) { + tokens.push(Token { + offset_from, + offset_to, + position: tokens.len(), + text: token_text.to_owned(), + position_length: 1, + }); + } +} + +fn push_tokens_split_on_non_alphanumeric( + tokens: &mut Vec, + text: &str, + offset_from: usize, + offset_to: usize, +) { + let mut part_start = offset_from; + for (relative_offset, c) in text[offset_from..offset_to].char_indices() { + if !c.is_alphanumeric() { + let delimiter_offset = offset_from + relative_offset; + push_token(tokens, text, part_start, delimiter_offset); + part_start = delimiter_offset + c.len_utf8(); + } + } + push_token(tokens, text, part_start, offset_to); +} + impl TokenStream for IcuTokenStream { fn advance(&mut self) -> bool { if self.index < self.tokens.len() { @@ -63,15 +105,10 @@ impl Tokenizer for IcuTokenizer { }; for offset_to in boundaries { - let token_text = &text[offset_from..offset_to]; - if token_text.chars().any(char::is_alphanumeric) { - tokens.push(Token { - offset_from, - offset_to, - position: tokens.len(), - text: token_text.to_owned(), - position_length: 1, - }); + if self.split_on_non_alphanumeric { + push_tokens_split_on_non_alphanumeric(&mut tokens, text, offset_from, offset_to); + } else { + push_token(&mut tokens, text, offset_from, offset_to); } offset_from = offset_to; } @@ -84,14 +121,21 @@ impl Tokenizer for IcuTokenizer { mod tests { use crate::{IcuTokenizer, Token, TokenStream, Tokenizer}; - fn collect_tokens(text: &str) -> Vec { + fn collect_tokens_with_split(text: &str, split_on_non_alphanumeric: bool) -> Vec { let mut tokenizer = IcuTokenizer::default(); + if split_on_non_alphanumeric { + tokenizer = tokenizer.with_simple_split(); + } let mut stream = tokenizer.token_stream(text); let mut tokens = Vec::new(); stream.process(&mut |token| tokens.push(token.clone())); tokens } + fn collect_tokens(text: &str) -> Vec { + collect_tokens_with_split(text, false) + } + #[test] fn test_icu_tokenizer_segments_mixed_text() { let tokens = collect_tokens("Hello, こんにちは世界!"); @@ -124,4 +168,37 @@ mod tests { vec!["Mark'd", "ye", "his", "words"] ); } + + #[test] + fn test_icu_tokenizer_splits_on_non_alphanumeric_when_enabled() { + let tokens = collect_tokens_with_split("foo_bar__baz-alpha.beta", true); + + assert_eq!( + tokens + .iter() + .map(|token| token.text.as_str()) + .collect::>(), + vec!["foo", "bar", "baz", "alpha", "beta"] + ); + assert_eq!( + tokens + .iter() + .map(|token| (token.offset_from, token.offset_to, token.position)) + .collect::>(), + vec![(0, 3, 0), (4, 7, 1), (9, 12, 2), (13, 18, 3), (19, 23, 4)] + ); + } + + #[test] + fn test_icu_tokenizer_split_control_keeps_icu_segmentation() { + let tokens = collect_tokens_with_split("hello_world こんにちは世界", true); + + assert_eq!( + tokens + .iter() + .map(|token| token.text.as_str()) + .collect::>(), + vec!["hello", "world", "こんにちは", "世界"] + ); + } }