From 50773607ec31eab977e39755d2343d5097a9903a Mon Sep 17 00:00:00 2001
From: Xuanwo <github@xuanwo.io>
Date: Thu, 25 Jun 2026 16:03:56 +0800
Subject: [PATCH] feat: add ICU split tokenizer variant

---
 docs/src/format/index/scalar/fts.md           |  5 +-
 docs/src/guide/tokenizer.md                   |  6 ++
 docs/src/quickstart/full-text-search.md       |  2 +-
 .../index/scalar/InvertedIndexParams.java     |  2 +
 .../index/scalar/InvertedIndexParamsTest.java | 31 ++++++
 python/python/lance/dataset.py                |  2 +
 python/python/tests/test_scalar_index.py      | 49 ++++++++++
 .../src/scalar/inverted/tokenizer.rs          | 35 ++++++-
 rust/lance-tokenizer/src/icu.rs               | 97 +++++++++++++++++--
 9 files changed, 213 insertions(+), 16 deletions(-)
 create mode 100644 java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java

diff --git a/docs/src/format/index/scalar/fts.md b/docs/src/format/index/scalar/fts.md
index 792702aca08..adc7f94d65e 100644
--- a/docs/src/format/index/scalar/fts.md
+++ b/docs/src/format/index/scalar/fts.md
@@ -81,6 +81,7 @@ The full text search index supports multiple tokenizer types for different text
 | **raw**        | No tokenization, treats entire text as single token                       | Exact matching         |
 | **ngram**      | Breaks text into overlapping character sequences                          | Substring/fuzzy search |
 | **icu**        | ICU dictionary-based Unicode word segmentation                            | Mixed-language text    |
+| **icu/split**  | ICU segmentation with simple-style delimiter splitting                    | Mixed-language identifiers |
 | **jieba/***    | Chinese text tokenizer with word segmentation                             | Chinese text           |
 | **lindera/***  | Japanese text tokenizer with morphological analysis                       | Japanese text          |
 
@@ -88,8 +89,10 @@ The full text search index supports multiple tokenizer types for different text
 
 The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token.
 
+By default, Lance preserves ICU word segments as returned by ICU. Use `base_tokenizer: "icu/split"` to split ICU word segments again on non-alphanumeric delimiters such as underscores and punctuation. For example, `hello_world こんにちは世界` is tokenized as `hello`, `world`, `こんにちは`, and `世界`.
+
 - **Models**: Uses compiled ICU4X segmenter data bundled with Lance
-- **Usage**: Specify as `icu`
+- **Usage**: Specify as `icu`, or `icu/split` to split punctuation-delimited identifiers
 - **Features**:
   - Unicode-aware word boundary detection
   - Dictionary-based segmentation for Chinese, Japanese, Khmer, Lao, Myanmar, and Thai
diff --git a/docs/src/guide/tokenizer.md b/docs/src/guide/tokenizer.md
index 192574656f5..eedac585fe9 100644
--- a/docs/src/guide/tokenizer.md
+++ b/docs/src/guide/tokenizer.md
@@ -20,6 +20,12 @@ ICU uses Unicode word boundary rules and bundled dictionary data for complex scr
 ds.create_scalar_index("text", "INVERTED", base_tokenizer="icu")
 ```
 
+Use `icu/split` when mixed-language text also contains punctuation-delimited identifiers that should be searchable by part.
+
+```python
+ds.create_scalar_index("text", "INVERTED", base_tokenizer="icu/split")
+```
+
 ## Language Models of Jieba
 
 ### Downloading the Model
diff --git a/docs/src/quickstart/full-text-search.md b/docs/src/quickstart/full-text-search.md
index 17327e40bc5..f990b2bd589 100644
--- a/docs/src/quickstart/full-text-search.md
+++ b/docs/src/quickstart/full-text-search.md
@@ -90,7 +90,7 @@ ds.create_scalar_index(
     index_type="INVERTED",
     name="text_idx",              # Optional index name (if omitted, default is "text_idx")
     with_position=False,          # Set True to enable phrase queries (stores token positions)
-    base_tokenizer="simple",      # Tokenizer: "simple" (whitespace+punct), "icu", "whitespace", or "raw" (no tokenization)
+    base_tokenizer="simple",      # Tokenizer: "simple", "icu", "icu/split", "whitespace", "raw", or "ngram"
     language="English",           # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True)
     max_token_length=40,          # Drop tokens longer than this length
     lower_case=True,              # Lowercase text before tokenization
diff --git a/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java b/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
index ca0a7a46c70..f509efd922b 100755
--- a/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
+++ b/java/src/main/java/org/lance/index/scalar/InvertedIndexParams.java
@@ -65,6 +65,8 @@ public static final class Builder {
      *   <li>{@code "whitespace"}: splits tokens on whitespace
      *   <li>{@code "raw"}: no tokenization
      *   <li>{@code "ngram"}: N-Gram tokenizer
+     *   <li>{@code "icu"}: ICU dictionary-based Unicode word segmentation
+     *   <li>{@code "icu/split"}: ICU segmentation with simple-style delimiter splitting
      *   <li>{@code "lindera/*"}: Lindera tokenizer
      *   <li>{@code "jieba/*"}: Jieba tokenizer
      * </ul>
diff --git a/java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java b/java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java
new file mode 100644
index 00000000000..e5024a95c2a
--- /dev/null
+++ b/java/src/test/java/org/lance/index/scalar/InvertedIndexParamsTest.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.lance.index.scalar;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+public class InvertedIndexParamsTest {
+  /** Checks that the {@code icu/split} tokenizer name is carried into the JSON index params. */
+  @Test
+  public void testIcuSplitTokenizerVariant() {
+    ScalarIndexParams params = InvertedIndexParams.builder().baseTokenizer("icu/split").build();
+    // The builder is expected to report the "inverted" index type with a JSON payload attached.
+    assertEquals("inverted", params.getIndexType());
+    String jsonParams = params.getJsonParams().orElseThrow(AssertionError::new);
+    assertTrue(jsonParams.contains("\"base_tokenizer\":\"icu/split\""));
+  }
+}
diff --git a/python/python/lance/dataset.py b/python/python/lance/dataset.py
index 57a8238132c..fda06eff260 100644
--- a/python/python/lance/dataset.py
+++ b/python/python/lance/dataset.py
@@ -3326,6 +3326,8 @@ def create_scalar_index(
             * "simple": splits tokens on whitespace and punctuation.
             * "whitespace": splits tokens on whitespace.
             * "raw": no tokenization.
+            * "icu": ICU dictionary-based Unicode word segmentation.
+            * "icu/split": ICU segmentation with simple-style delimiter splitting.
         language: str, default "English"
             This is for the ``INVERTED`` index. The language for stemming
             and stop words. This is only used when `stem` or `remove_stop_words` is true
diff --git a/python/python/tests/test_scalar_index.py b/python/python/tests/test_scalar_index.py
index b6e882633f5..4450f6821cc 100644
--- a/python/python/tests/test_scalar_index.py
+++ b/python/python/tests/test_scalar_index.py
@@ -1869,6 +1869,55 @@ def test_icu_tokenizer(tmp_path):
     assert results["_rowid"].to_pylist() == [0]
 
 
+def test_icu_tokenizer_split_on_non_alphanumeric_default(tmp_path):
+    data = pa.table({"text": ["hello_world"]})
+    ds = lance.write_dataset(data, tmp_path, mode="overwrite")
+    ds.create_scalar_index(
+        "text",
+        "INVERTED",
+        base_tokenizer="icu",
+        stem=False,
+        remove_stop_words=False,
+    )
+    # Plain "icu" does not split on "_", so the partial term "hello" must not match.
+    results = ds.to_table(full_text_query="hello", prefilter=True, with_row_id=True)
+    assert results.num_rows == 0
+    # The full identifier is still searchable.
+    results = ds.to_table(
+        full_text_query="hello_world", prefilter=True, with_row_id=True
+    )
+    assert results["_rowid"].to_pylist() == [0]
+
+
+def test_icu_tokenizer_split_on_non_alphanumeric(tmp_path):
+    data = pa.table(
+        {
+            "text": [
+                "hello_world こんにちは世界",
+                "alpha.beta",
+            ],
+        }
+    )
+    ds = lance.write_dataset(data, tmp_path, mode="overwrite")
+    ds.create_scalar_index(
+        "text",
+        "INVERTED",
+        base_tokenizer="icu/split",
+        stem=False,
+        remove_stop_words=False,
+    )
+    # Each delimiter-separated part and the ICU-segmented CJK word must match its row.
+    for query, expected_row_ids in [
+        ("hello", [0]),
+        ("world", [0]),
+        ("世界", [0]),
+        ("alpha", [1]),
+        ("beta", [1]),
+    ]:
+        results = ds.to_table(full_text_query=query, prefilter=True, with_row_id=True)
+        assert results["_rowid"].to_pylist() == expected_row_ids
+
+
 def test_jieba_invalid_user_dict_tokenizer(tmp_path):
     set_language_model_path()
     data = pa.table(
diff --git a/rust/lance-index/src/scalar/inverted/tokenizer.rs b/rust/lance-index/src/scalar/inverted/tokenizer.rs
index 5a2a701dc73..1785a23fedd 100644
--- a/rust/lance-index/src/scalar/inverted/tokenizer.rs
+++ b/rust/lance-index/src/scalar/inverted/tokenizer.rs
@@ -42,6 +42,7 @@ pub struct InvertedIndexParams {
     /// - `whitespace`: splits tokens on whitespace
     /// - `raw`: no tokenization
     /// - `icu`: ICU dictionary-based word segmentation
+    /// - `icu/split`: ICU segmentation with simple-style delimiter splitting
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
@@ -197,6 +198,7 @@ impl InvertedIndexParams {
     /// - `raw`: no tokenization
     /// - `ngram`: N-Gram tokenizer
     /// - `icu`: ICU dictionary-based word segmentation
+    /// - `icu/split`: ICU segmentation with simple-style delimiter splitting
     /// - `lindera/*`: Lindera tokenizer
     /// - `jieba/*`: Jieba tokenizer
     ///
@@ -376,7 +378,9 @@ impl InvertedIndexParams {
     fn stop_word_filter(&self) -> Result<StopWordFilter> {
         match &self.custom_stop_words {
             Some(words) => Ok(StopWordFilter::remove(words.iter().cloned())),
-            None if self.base_tokenizer == "icu" => Ok(StopWordFilter::all()),
+            None if self.base_tokenizer == "icu" || self.base_tokenizer == "icu/split" => {
+                Ok(StopWordFilter::all())
+            }
             None => StopWordFilter::new(self.language).ok_or_else(|| {
                 Error::invalid_input(format!(
                     "removing stop words for language {:?} is not supported yet",
@@ -392,6 +396,9 @@ impl InvertedIndexParams {
             "whitespace" => Ok(TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()),
             "raw" => Ok(TextAnalyzer::builder(RawTokenizer::default()).dynamic()),
             "icu" => Ok(TextAnalyzer::builder(IcuTokenizer::default()).dynamic()),
+            "icu/split" => {
+                Ok(TextAnalyzer::builder(IcuTokenizer::default().with_simple_split()).dynamic())
+            }
             "ngram" => {
                 let tokenizer = NgramTokenizer::new(
                     self.min_ngram_length as usize,
@@ -445,6 +452,7 @@ pub fn language_model_home() -> Option<PathBuf> {
 mod tests {
     use super::InvertedIndexParams;
     use lance_tokenizer::TokenStream;
+    use rstest::rstest;
 
     #[test]
     fn test_build_only_fields_are_not_serialized() {
@@ -508,6 +516,23 @@ mod tests {
         assert_eq!(tokens, vec!["hello", "こんにちは", "世界"]);
     }
 
+    #[test]
+    fn test_build_icu_tokenizer_with_split_on_non_alphanumeric() {
+        let mut tokenizer = InvertedIndexParams::default()
+            .base_tokenizer("icu/split".to_string())
+            .stem(false)
+            .remove_stop_words(false)
+            .build()
+            .unwrap();
+        let mut stream = tokenizer.token_stream_for_doc("hello_world こんにちは世界 alpha.beta");
+        let mut tokens = Vec::new();
+        stream.process(&mut |token| tokens.push(token.text.clone()));
+        assert_eq!(
+            tokens,
+            vec!["hello", "world", "こんにちは", "世界", "alpha", "beta"]
+        );
+    }
+
     #[test]
     fn test_remove_stop_words_respects_language_for_non_icu_tokenizer() {
         let mut tokenizer = InvertedIndexParams::default()
@@ -541,11 +566,13 @@ mod tests {
         assert_eq!(tokens, vec!["the".to_string(), "data".to_string()]);
     }
 
-    #[test]
-    fn test_icu_stop_words_use_all_builtin_lists() {
+    #[rstest]
+    #[case::icu("icu")]
+    #[case::icu_split("icu/split")]
+    fn test_icu_stop_words_use_all_builtin_lists(#[case] base_tokenizer: &str) {
         let mut tokenizer = InvertedIndexParams::default()
             .stem(false)
-            .base_tokenizer("icu".to_string())
+            .base_tokenizer(base_tokenizer.to_string())
             .build()
             .unwrap();
         let mut stream = tokenizer.token_stream_for_search("the 的 lance data");
diff --git a/rust/lance-tokenizer/src/icu.rs b/rust/lance-tokenizer/src/icu.rs
index 4e36f23115f..eb9cd874c76 100644
--- a/rust/lance-tokenizer/src/icu.rs
+++ b/rust/lance-tokenizer/src/icu.rs
@@ -8,17 +8,25 @@ use crate::{TextAnalyzer, TextAnalyzerBuilder, Token, TokenStream, Tokenizer};
 #[derive(Clone)]
 pub struct IcuTokenizer {
     segmenter: WordSegmenterBorrowed<'static>,
+    split_on_non_alphanumeric: bool,
 }
 
 impl Default for IcuTokenizer {
     fn default() -> Self {
         Self {
             segmenter: WordSegmenter::new_dictionary(WordBreakInvariantOptions::default()),
+            split_on_non_alphanumeric: false,
         }
     }
 }
 
 impl IcuTokenizer {
+    /// Makes the tokenizer split each ICU word segment again at every non-alphanumeric character.
+    pub fn with_simple_split(mut self) -> Self {
+        self.split_on_non_alphanumeric = true;
+        self
+    }
+
     pub fn analyzer(self) -> TextAnalyzer {
         TextAnalyzer::builder(self).build()
     }
@@ -33,6 +41,40 @@ pub struct IcuTokenStream {
     index: usize,
 }
 
+/// Pushes `text[offset_from..offset_to]` onto `tokens` as the next token.
+///
+/// Nothing is pushed when the range is empty or contains no alphanumeric character.
+fn push_token(tokens: &mut Vec<Token>, text: &str, offset_from: usize, offset_to: usize) {
+    let has_alphanumeric = |s: &str| s.chars().any(char::is_alphanumeric);
+    // `&&` short-circuits, so an empty range is rejected before it is ever sliced.
+    if offset_from != offset_to && has_alphanumeric(&text[offset_from..offset_to]) {
+        tokens.push(Token {
+            offset_from,
+            offset_to,
+            position: tokens.len(),
+            text: text[offset_from..offset_to].to_owned(),
+            position_length: 1,
+        });
+    }
+}
+
+/// Splits `text[offset_from..offset_to]` at non-alphanumeric chars and pushes each remaining run.
+fn push_tokens_split_on_non_alphanumeric(
+    tokens: &mut Vec<Token>,
+    text: &str,
+    offset_from: usize,
+    offset_to: usize,
+) {
+    let span = &text[offset_from..offset_to];
+    let mut run_start = offset_from;
+    // Visit only the delimiters; `rel` is a byte offset relative to `offset_from`.
+    for (rel, c) in span.char_indices().filter(|(_, c)| !c.is_alphanumeric()) {
+        push_token(tokens, text, run_start, offset_from + rel);
+        run_start = offset_from + rel + c.len_utf8();
+    }
+    push_token(tokens, text, run_start, offset_to);
+}
+
 impl TokenStream for IcuTokenStream {
     fn advance(&mut self) -> bool {
         if self.index < self.tokens.len() {
@@ -63,15 +105,10 @@ impl Tokenizer for IcuTokenizer {
         };
 
         for offset_to in boundaries {
-            let token_text = &text[offset_from..offset_to];
-            if token_text.chars().any(char::is_alphanumeric) {
-                tokens.push(Token {
-                    offset_from,
-                    offset_to,
-                    position: tokens.len(),
-                    text: token_text.to_owned(),
-                    position_length: 1,
-                });
+            if self.split_on_non_alphanumeric {
+                push_tokens_split_on_non_alphanumeric(&mut tokens, text, offset_from, offset_to);
+            } else {
+                push_token(&mut tokens, text, offset_from, offset_to);
             }
             offset_from = offset_to;
         }
@@ -84,14 +121,21 @@ impl Tokenizer for IcuTokenizer {
 mod tests {
     use crate::{IcuTokenizer, Token, TokenStream, Tokenizer};
 
-    fn collect_tokens(text: &str) -> Vec<Token> {
+    fn collect_tokens_with_split(text: &str, split_on_non_alphanumeric: bool) -> Vec<Token> {
         let mut tokenizer = IcuTokenizer::default();
+        if split_on_non_alphanumeric {
+            tokenizer = tokenizer.with_simple_split();
+        }
         let mut stream = tokenizer.token_stream(text);
         let mut tokens = Vec::new();
         stream.process(&mut |token| tokens.push(token.clone()));
         tokens
     }
 
+    fn collect_tokens(text: &str) -> Vec<Token> {
+        collect_tokens_with_split(text, false)
+    }
+
     #[test]
     fn test_icu_tokenizer_segments_mixed_text() {
         let tokens = collect_tokens("Hello, こんにちは世界!");
@@ -124,4 +168,37 @@ mod tests {
             vec!["Mark'd", "ye", "his", "words"]
         );
     }
+
+    #[test]
+    fn test_icu_tokenizer_splits_on_non_alphanumeric_when_enabled() {
+        let tokens = collect_tokens_with_split("foo_bar__baz-alpha.beta", true);
+        // `_`, `-` and `.` all delimit tokens; the doubled `__` must not produce an empty token.
+        assert_eq!(
+            tokens
+                .iter()
+                .map(|token| token.text.as_str())
+                .collect::<Vec<_>>(),
+            vec!["foo", "bar", "baz", "alpha", "beta"]
+        );
+        assert_eq!(
+            tokens
+                .iter()
+                .map(|token| (token.offset_from, token.offset_to, token.position))
+                .collect::<Vec<_>>(),
+            vec![(0, 3, 0), (4, 7, 1), (9, 12, 2), (13, 18, 3), (19, 23, 4)]
+        );
+    }
+
+    #[test]
+    fn test_icu_tokenizer_split_control_keeps_icu_segmentation() {
+        let tokens = collect_tokens_with_split("hello_world こんにちは世界", true);
+        // Splitting is applied on top of ICU: the unspaced CJK run is still cut into two words.
+        assert_eq!(
+            tokens
+                .iter()
+                .map(|token| token.text.as_str())
+                .collect::<Vec<_>>(),
+            vec!["hello", "world", "こんにちは", "世界"]
+        );
+    }
 }