Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion docs/src/format/index/scalar/fts.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,15 +81,18 @@ The full text search index supports multiple tokenizer types for different text
| **raw** | No tokenization, treats entire text as single token | Exact matching |
| **ngram** | Breaks text into overlapping character sequences | Substring/fuzzy search |
| **icu** | ICU dictionary-based Unicode word segmentation | Mixed-language text |
| **icu/split** | ICU segmentation with simple-style delimiter splitting | Mixed-language identifiers |
| **jieba/*** | Chinese text tokenizer with word segmentation | Chinese text |
| **lindera/*** | Japanese text tokenizer with morphological analysis | Japanese text |

#### ICU Tokenizer (Mixed-language text)

The ICU tokenizer uses Unicode word boundary rules and dictionary-based segmentation for complex scripts. It is useful for mixed-language text where the default `simple` tokenizer would keep an unspaced CJK span as one large token.

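With `base_tokenizer: "icu"`, Lance keeps ICU word segments exactly as ICU returns them, so an identifier such as `hello_world` stays a single token and is not matched by a search for `hello`. Use `base_tokenizer: "icu/split"` to split each ICU word segment again on non-alphanumeric delimiters such as underscores and punctuation. For example, with `icu/split`, `hello_world こんにちは世界` is tokenized as `hello`, `world`, `こんにちは`, and `世界`.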

- **Models**: Uses compiled ICU4X segmenter data bundled with Lance
- **Usage**: Specify as `icu`
- **Usage**: Specify as `icu`, or `icu/split` to split punctuation-delimited identifiers
- **Features**:
- Unicode-aware word boundary detection
- Dictionary-based segmentation for Chinese, Japanese, Khmer, Lao, Myanmar, and Thai
Expand Down
6 changes: 6 additions & 0 deletions docs/src/guide/tokenizer.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ ICU uses Unicode word boundary rules and bundled dictionary data for complex scr
ds.create_scalar_index("text", "INVERTED", base_tokenizer="icu")
```

Use `icu/split` when mixed-language text also contains punctuation-delimited identifiers that should be searchable by part.

```python
ds.create_scalar_index("text", "INVERTED", base_tokenizer="icu/split")
```

## Language Models of Jieba

### Downloading the Model
Expand Down
2 changes: 1 addition & 1 deletion docs/src/quickstart/full-text-search.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ ds.create_scalar_index(
index_type="INVERTED",
name="text_idx", # Optional index name (if omitted, default is "text_idx")
with_position=False, # Set True to enable phrase queries (stores token positions)
base_tokenizer="simple", # Tokenizer: "simple" (whitespace+punct), "icu", "whitespace", or "raw" (no tokenization)
base_tokenizer="simple", # Tokenizer: "simple", "icu", "icu/split", "whitespace", "raw", or "ngram"
language="English", # Language used for stemming + stop words (only used if `stem` or `remove_stop_words` is True)
max_token_length=40, # Drop tokens longer than this length
lower_case=True, # Lowercase text before tokenization
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ public static final class Builder {
* <li>{@code "whitespace"}: splits tokens on whitespace
* <li>{@code "raw"}: no tokenization
* <li>{@code "ngram"}: N-Gram tokenizer
* <li>{@code "icu"}: ICU dictionary-based Unicode word segmentation
* <li>{@code "icu/split"}: ICU segmentation with simple-style delimiter splitting
* <li>{@code "lindera/*"}: Lindera tokenizer
* <li>{@code "jieba/*"}: Jieba tokenizer
* </ul>
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.lance.index.scalar;

import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class InvertedIndexParamsTest {

  /**
   * Checks that params built with the {@code "icu/split"} base tokenizer report the
   * {@code "inverted"} index type and carry that tokenizer name in their JSON parameters.
   */
  @Test
  public void testIcuSplitTokenizerVariant() {
    ScalarIndexParams params =
        InvertedIndexParams.builder()
            .baseTokenizer("icu/split")
            .build();

    assertEquals("inverted", params.getIndexType());

    // A missing JSON payload is surfaced as an assertion failure.
    String serialized = params.getJsonParams().orElseThrow(() -> new AssertionError());
    boolean mentionsTokenizer = serialized.contains("\"base_tokenizer\":\"icu/split\"");
    assertTrue(mentionsTokenizer);
  }
}
2 changes: 2 additions & 0 deletions python/python/lance/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -3326,6 +3326,8 @@ def create_scalar_index(
* "simple": splits tokens on whitespace and punctuation.
* "whitespace": splits tokens on whitespace.
* "raw": no tokenization.
* "icu": ICU dictionary-based Unicode word segmentation.
* "icu/split": ICU segmentation with simple-style delimiter splitting.
language: str, default "English"
This is for the ``INVERTED`` index. The language for stemming
and stop words. This is only used when `stem` or `remove_stop_words` is true
Expand Down
49 changes: 49 additions & 0 deletions python/python/tests/test_scalar_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1869,6 +1869,55 @@ def test_icu_tokenizer(tmp_path):
assert results["_rowid"].to_pylist() == [0]


def test_icu_tokenizer_split_on_non_alphanumeric_default(tmp_path):
    """Plain ``icu`` is expected to index ``hello_world`` as one unsplit term."""
    table = pa.table({"text": ["hello_world"]})
    dataset = lance.write_dataset(table, tmp_path, mode="overwrite")
    dataset.create_scalar_index(
        "text",
        "INVERTED",
        base_tokenizer="icu",
        stem=False,
        remove_stop_words=False,
    )
    search_options = dict(prefilter=True, with_row_id=True)

    # Searching for only one part of the identifier must not match anything.
    partial_hits = dataset.to_table(full_text_query="hello", **search_options)
    assert partial_hits.num_rows == 0

    # Searching for the whole identifier finds the single row.
    whole_hits = dataset.to_table(full_text_query="hello_world", **search_options)
    assert whole_hits["_rowid"].to_pylist() == [0]


def test_icu_tokenizer_split_on_non_alphanumeric(tmp_path):
    """``icu/split`` is expected to make each delimiter-separated part searchable."""
    rows = [
        "hello_world こんにちは世界",
        "alpha.beta",
    ]
    table = pa.table({"text": rows})
    ds = lance.write_dataset(table, tmp_path, mode="overwrite")
    ds.create_scalar_index(
        "text",
        "INVERTED",
        base_tokenizer="icu/split",
        stem=False,
        remove_stop_words=False,
    )

    # Maps each query term to the row ids it should match, checked in this order.
    expected_hits = {
        "hello": [0],
        "world": [0],
        "世界": [0],
        "alpha": [1],
        "beta": [1],
    }
    for term, row_ids in expected_hits.items():
        hits = ds.to_table(full_text_query=term, prefilter=True, with_row_id=True)
        assert hits["_rowid"].to_pylist() == row_ids


def test_jieba_invalid_user_dict_tokenizer(tmp_path):
set_language_model_path()
data = pa.table(
Expand Down
35 changes: 31 additions & 4 deletions rust/lance-index/src/scalar/inverted/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ pub struct InvertedIndexParams {
/// - `whitespace`: splits tokens on whitespace
/// - `raw`: no tokenization
/// - `icu`: ICU dictionary-based word segmentation
/// - `icu/split`: ICU segmentation with simple-style delimiter splitting
/// - `lindera/*`: Lindera tokenizer
/// - `jieba/*`: Jieba tokenizer
///
Expand Down Expand Up @@ -197,6 +198,7 @@ impl InvertedIndexParams {
/// - `raw`: no tokenization
/// - `ngram`: N-Gram tokenizer
/// - `icu`: ICU dictionary-based word segmentation
/// - `icu/split`: ICU segmentation with simple-style delimiter splitting
/// - `lindera/*`: Lindera tokenizer
/// - `jieba/*`: Jieba tokenizer
///
Expand Down Expand Up @@ -376,7 +378,9 @@ impl InvertedIndexParams {
fn stop_word_filter(&self) -> Result<StopWordFilter> {
match &self.custom_stop_words {
Some(words) => Ok(StopWordFilter::remove(words.iter().cloned())),
None if self.base_tokenizer == "icu" => Ok(StopWordFilter::all()),
None if self.base_tokenizer == "icu" || self.base_tokenizer == "icu/split" => {
Ok(StopWordFilter::all())
}
None => StopWordFilter::new(self.language).ok_or_else(|| {
Error::invalid_input(format!(
"removing stop words for language {:?} is not supported yet",
Expand All @@ -392,6 +396,9 @@ impl InvertedIndexParams {
"whitespace" => Ok(TextAnalyzer::builder(WhitespaceTokenizer::default()).dynamic()),
"raw" => Ok(TextAnalyzer::builder(RawTokenizer::default()).dynamic()),
"icu" => Ok(TextAnalyzer::builder(IcuTokenizer::default()).dynamic()),
"icu/split" => {
Ok(TextAnalyzer::builder(IcuTokenizer::default().with_simple_split()).dynamic())
}
"ngram" => {
let tokenizer = NgramTokenizer::new(
self.min_ngram_length as usize,
Expand Down Expand Up @@ -445,6 +452,7 @@ pub fn language_model_home() -> Option<PathBuf> {
mod tests {
use super::InvertedIndexParams;
use lance_tokenizer::TokenStream;
use rstest::rstest;

#[test]
fn test_build_only_fields_are_not_serialized() {
Expand Down Expand Up @@ -508,6 +516,23 @@ mod tests {
assert_eq!(tokens, vec!["hello", "こんにちは", "世界"]);
}

#[test]
fn test_build_icu_tokenizer_with_split_on_non_alphanumeric() {
    // Stemming and stop-word removal are disabled so only the base tokenizer shapes the output.
    let mut tokenizer = InvertedIndexParams::default()
        .base_tokenizer("icu/split".to_string())
        .stem(false)
        .remove_stop_words(false)
        .build()
        .unwrap();

    let document = "hello_world こんにちは世界 alpha.beta";
    let mut stream = tokenizer.token_stream_for_doc(document);

    let mut seen = Vec::new();
    stream.process(&mut |token| seen.push(token.text.clone()));

    let expected = vec!["hello", "world", "こんにちは", "世界", "alpha", "beta"];
    assert_eq!(seen, expected);
}

#[test]
fn test_remove_stop_words_respects_language_for_non_icu_tokenizer() {
let mut tokenizer = InvertedIndexParams::default()
Expand Down Expand Up @@ -541,11 +566,13 @@ mod tests {
assert_eq!(tokens, vec!["the".to_string(), "data".to_string()]);
}

#[test]
fn test_icu_stop_words_use_all_builtin_lists() {
#[rstest]
#[case::icu("icu")]
#[case::icu_split("icu/split")]
fn test_icu_stop_words_use_all_builtin_lists(#[case] base_tokenizer: &str) {
let mut tokenizer = InvertedIndexParams::default()
.stem(false)
.base_tokenizer("icu".to_string())
.base_tokenizer(base_tokenizer.to_string())
.build()
.unwrap();
let mut stream = tokenizer.token_stream_for_search("the 的 lance data");
Expand Down
97 changes: 87 additions & 10 deletions rust/lance-tokenizer/src/icu.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,25 @@ use crate::{TextAnalyzer, TextAnalyzerBuilder, Token, TokenStream, Tokenizer};
#[derive(Clone)]
pub struct IcuTokenizer {
/// ICU4X word segmenter; `Default` builds it with `WordSegmenter::new_dictionary`.
segmenter: WordSegmenterBorrowed<'static>,
/// When `true`, each ICU segment is split again on non-alphanumeric characters.
/// `Default` leaves this `false`; `with_simple_split` turns it on.
split_on_non_alphanumeric: bool,
}

impl Default for IcuTokenizer {
    /// Builds a tokenizer backed by the dictionary word segmenter with default word-break
    /// options; the extra non-alphanumeric split is off.
    fn default() -> Self {
        let segmenter = WordSegmenter::new_dictionary(WordBreakInvariantOptions::default());
        Self {
            segmenter,
            split_on_non_alphanumeric: false,
        }
    }
}

impl IcuTokenizer {
/// Split ICU word segments again on simple-tokenizer delimiters.
///
/// Consumes the tokenizer and returns it with `split_on_non_alphanumeric` enabled, so every
/// character that is not alphanumeric (underscores, punctuation, ...) separates tokens inside
/// an ICU word segment.
///
/// The configured tokenizer is only available through the return value; dropping it discards
/// the setting, hence `#[must_use]`.
#[must_use = "this returns the reconfigured tokenizer and does not modify anything in place"]
pub fn with_simple_split(mut self) -> Self {
    self.split_on_non_alphanumeric = true;
    self
}

/// Consumes the tokenizer and wraps it in a [`TextAnalyzer`] via `TextAnalyzer::builder`,
/// with no further builder configuration applied.
pub fn analyzer(self) -> TextAnalyzer {
TextAnalyzer::builder(self).build()
}
Expand All @@ -33,6 +41,40 @@ pub struct IcuTokenStream {
index: usize,
}

/// Appends `text[offset_from..offset_to]` to `tokens` when it is worth indexing.
///
/// Empty ranges and ranges without any alphanumeric character are skipped. The new token's
/// `position` is the number of tokens already collected, so skipped ranges leave no gaps in
/// the position sequence. Offsets are byte offsets into `text`; a non-empty range that is out
/// of bounds or not on `char` boundaries panics when sliced.
fn push_token(tokens: &mut Vec<Token>, text: &str, offset_from: usize, offset_to: usize) {
    // Checked before slicing so an empty range never touches `text`.
    if offset_from != offset_to {
        let segment = &text[offset_from..offset_to];
        let has_alphanumeric = segment.chars().any(|c| c.is_alphanumeric());
        if has_alphanumeric {
            let position = tokens.len();
            tokens.push(Token {
                offset_from,
                offset_to,
                position,
                text: segment.to_owned(),
                position_length: 1,
            });
        }
    }
}

/// Splits `text[offset_from..offset_to]` on non-alphanumeric characters and pushes each run.
///
/// Every `char` for which `char::is_alphanumeric` is false acts as a delimiter and is dropped.
/// The runs between delimiters are handed to `push_token` with byte offsets relative to the
/// whole `text`, so consecutive delimiters produce empty runs that `push_token` ignores.
fn push_tokens_split_on_non_alphanumeric(
    tokens: &mut Vec<Token>,
    text: &str,
    offset_from: usize,
    offset_to: usize,
) {
    let segment = &text[offset_from..offset_to];
    let mut run_start = offset_from;
    // `match_indices` yields each delimiter char with its byte index inside `segment`.
    for (index, delimiter) in segment.match_indices(|c: char| !c.is_alphanumeric()) {
        let run_end = offset_from + index;
        push_token(tokens, text, run_start, run_end);
        // Resume right after the (possibly multi-byte) delimiter.
        run_start = run_end + delimiter.len();
    }
    // Trailing run after the last delimiter, or the whole segment when there was none.
    push_token(tokens, text, run_start, offset_to);
}

impl TokenStream for IcuTokenStream {
fn advance(&mut self) -> bool {
if self.index < self.tokens.len() {
Expand Down Expand Up @@ -63,15 +105,10 @@ impl Tokenizer for IcuTokenizer {
};

for offset_to in boundaries {
let token_text = &text[offset_from..offset_to];
if token_text.chars().any(char::is_alphanumeric) {
tokens.push(Token {
offset_from,
offset_to,
position: tokens.len(),
text: token_text.to_owned(),
position_length: 1,
});
if self.split_on_non_alphanumeric {
push_tokens_split_on_non_alphanumeric(&mut tokens, text, offset_from, offset_to);
} else {
push_token(&mut tokens, text, offset_from, offset_to);
}
offset_from = offset_to;
}
Expand All @@ -84,14 +121,21 @@ impl Tokenizer for IcuTokenizer {
mod tests {
use crate::{IcuTokenizer, Token, TokenStream, Tokenizer};

fn collect_tokens(text: &str) -> Vec<Token> {
/// Runs the ICU tokenizer over `text` and returns every emitted token.
///
/// `split_on_non_alphanumeric` selects between the default tokenizer and the variant
/// produced by `with_simple_split`.
fn collect_tokens_with_split(text: &str, split_on_non_alphanumeric: bool) -> Vec<Token> {
    let mut tokenizer = if split_on_non_alphanumeric {
        IcuTokenizer::default().with_simple_split()
    } else {
        IcuTokenizer::default()
    };
    let mut stream = tokenizer.token_stream(text);
    let mut collected = Vec::new();
    stream.process(&mut |token| collected.push(token.clone()));
    collected
}

/// Tokenizes `text` with the default ICU tokenizer, i.e. without the extra
/// non-alphanumeric split.
fn collect_tokens(text: &str) -> Vec<Token> {
collect_tokens_with_split(text, false)
}

#[test]
fn test_icu_tokenizer_segments_mixed_text() {
let tokens = collect_tokens("Hello, こんにちは世界!");
Expand Down Expand Up @@ -124,4 +168,37 @@ mod tests {
vec!["Mark'd", "ye", "his", "words"]
);
}

#[test]
fn test_icu_tokenizer_splits_on_non_alphanumeric_when_enabled() {
    let tokens = collect_tokens_with_split("foo_bar__baz-alpha.beta", true);

    // Underscores (single and doubled), hyphen and dot all act as delimiters.
    let texts: Vec<_> = tokens.iter().map(|token| token.text.as_str()).collect();
    assert_eq!(texts, vec!["foo", "bar", "baz", "alpha", "beta"]);

    // Offsets are byte offsets into the input; positions count emitted tokens only.
    let spans: Vec<_> = tokens
        .iter()
        .map(|token| (token.offset_from, token.offset_to, token.position))
        .collect();
    assert_eq!(
        spans,
        vec![(0, 3, 0), (4, 7, 1), (9, 12, 2), (13, 18, 3), (19, 23, 4)]
    );
}

#[test]
fn test_icu_tokenizer_split_control_keeps_icu_segmentation() {
    // With the split enabled, the Japanese span must still come out as separate ICU words.
    let tokens = collect_tokens_with_split("hello_world こんにちは世界", true);

    let texts: Vec<_> = tokens.iter().map(|token| token.text.as_str()).collect();
    let expected = vec!["hello", "world", "こんにちは", "世界"];
    assert_eq!(texts, expected);
}
}
Loading