Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 13 additions & 1 deletion protos/index_old.proto

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we are going to touch this file, I'm thinking we need a better name for it.

Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,19 @@ message BTreeIndexDetails {}
message BitmapIndexDetails {}
message LabelListIndexDetails {}
message NGramIndexDetails {}
message ZoneMapIndexDetails {}
// Details message for the zone map index type. Both fields are `optional` so
// that their absence (datasets written before the fields existed) can be
// told apart from an explicit value.
message ZoneMapIndexDetails {
// Number of rows per zone. Optional for backwards compatibility: absent on
// datasets written before this field was added. When absent, no seed writer
// is created for the index.
optional uint64 rows_per_zone = 1;

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We will need a vote for this change.

// Whether seed-based incremental updates are enabled for this index.
//   - Absent (old datasets) or false: seeds disabled.
//   - true: seeds enabled; the index will embed per-fragment seed buffers in
//     data files and harvest them during incremental updates to skip full
//     scans.
// When the user does not set this explicitly, it defaults to true for
// variable-length column types (strings, binary) and for fixed-width types
// wider than 8 bytes.
// NOTE(review): "absent means disabled" and "defaults to true when not set"
// only agree if the default is resolved when the index is created and the
// result is stored here explicitly - presumably that is the intent; confirm.
optional bool use_seeds = 2;
}
message InvertedIndexDetails {
// Marking this field as optional as old versions of the index store blank details and we
// need to make sure we have a proper optional field to detect this.
Expand Down
1 change: 1 addition & 0 deletions rust/lance-index/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ rust-version.workspace = true
arc-swap.workspace = true
arrow.workspace = true
arrow-array.workspace = true
arrow-ipc.workspace = true
arrow-ord.workspace = true
arrow-schema.workspace = true
arrow-select.workspace = true
Expand Down
1 change: 1 addition & 0 deletions rust/lance-index/src/scalar.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pub mod ngram;
pub mod registry;
#[cfg(feature = "geo")]
pub mod rtree;
pub mod seed;
pub mod zoned;
pub mod zonemap;

Expand Down
51 changes: 50 additions & 1 deletion rust/lance-index/src/scalar/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
use std::borrow::Cow;
use std::sync::Arc;

use arrow_schema::Field;
use arrow_schema::{DataType, Field};
use async_trait::async_trait;
use datafusion::execution::SendableRecordBatchStream;
use lance_core::{
Expand Down Expand Up @@ -217,6 +217,55 @@ pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
// Return an empty JSON object as the default implementation
Ok(serde_json::json!({}))
}

/// Optionally create a seed writer for the given column.
///
/// A seed writer observes column values during data file writes, accumulates
/// compact statistics in memory, and serializes them as a global buffer
/// embedded in the data file footer. The buffer is later harvested during
/// index updates to skip a full column scan.
///
/// All parameters needed to construct the writer must be derivable from
/// `index_details` — this method must not perform any I/O. Return `Ok(None)`
/// if this index type does not support seed writing.
///
/// The default implementation ignores all of its arguments and returns
/// `Ok(None)`, so a plugin only takes part in seed writing if it overrides
/// this method.
///
/// NOTE(review): the method is declared `async` even though implementations
/// are told not to perform I/O — presumably for uniformity with the other
/// async methods on this trait; confirm.
async fn create_seed_writer(
&self,
_field_path: &str,
_data_type: &DataType,
_index_details: &prost_types::Any,
) -> Result<Option<Box<dyn super::seed::IndexSeedWriter>>> {
Ok(None)
}

/// Returns true if this index type may have seed buffers embedded in data
/// files for the given index configuration.
///
/// When false the caller can skip opening data files to look for seeds
/// entirely, avoiding I/O for index types or configurations that never
/// write seeds.
///
/// The default implementation ignores `_index_details` and returns `false`.
fn might_use_seeds(&self, _index_details: &prost_types::Any) -> bool {
false
}

/// Attempt to update `reference_index` using pre-harvested `seeds` instead
/// of re-scanning column data.
///
/// Each [`FragmentSeed`](super::seed::FragmentSeed) carries the raw bytes
/// written by the corresponding [`IndexSeedWriter`](super::seed::IndexSeedWriter)
/// and the original `metadata_value` stored in the data file, which the plugin
/// can use for compatibility validation (e.g. confirming `rows_per_zone`).
///
/// Return `Ok(Some(created))` if the seed-based update succeeded, or
/// `Ok(None)` to signal that the caller should fall back to a full column scan.
///
/// The default implementation ignores all of its arguments and returns
/// `Ok(None)`, i.e. it always asks the caller to fall back to a full column
/// scan.
///
/// NOTE(review): `_dest_store` is presumably the store the updated index is
/// written to — confirm against an overriding implementation.
async fn update_from_seeds(
&self,
_seeds: Vec<super::seed::FragmentSeed>,
_reference_index: Arc<dyn ScalarIndex>,
_index_details: &prost_types::Any,
_dest_store: &dyn IndexStore,
) -> Result<Option<CreatedIndex>> {
Ok(None)
}
}

/// In-memory cache key for a whole `Arc<dyn ScalarIndex>`.
Expand Down
59 changes: 59 additions & 0 deletions rust/lance-index/src/scalar/seed.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

//! Index seed writers — compact per-fragment summaries embedded in data files.
//!
//! A seed writer observes column values as they are written to a data file,
//! accumulates compact statistics in memory, and serializes them to a byte

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At the moment we assume the entire seed can comfortably fit in memory, but in theory we could split it into multiple global buffers in the future if we needed to.

//! buffer that is embedded in the data file footer as a global buffer.
//!
//! The buffer can later be read back during index updates to reconstruct index
//! statistics without re-scanning the column data.

use arrow_array::ArrayRef;
use bytes::Bytes;
use lance_core::Result;

/// Prefix shared by every seed-buffer schema metadata key.
///
/// The full key for a column is this prefix followed by the column name,
/// i.e. `"lance.seed.<column_name>"`.
pub const SEED_META_KEY_PREFIX: &str = "lance.seed.";

/// A hook registered during data file writes that observes column values batch
/// by batch, accumulates compact statistics in memory, and serializes them to
/// a byte buffer that is embedded in the data file footer as a global buffer.
///
/// The buffer can later be read back during index updates to reconstruct index
/// statistics without re-scanning the column data.
pub trait IndexSeedWriter: Send + std::fmt::Debug {
    /// The column this writer is interested in.
    fn column_name(&self) -> &str;

    /// Observe a slice of column values as they are written to the current
    /// fragment. Called once per batch.
    ///
    /// # Errors
    ///
    /// Returns an error if the writer cannot process `values`.
    fn observe_batch(&mut self, values: &ArrayRef) -> Result<()>;

    /// Serialize accumulated state to bytes and reset for the next fragment.
    ///
    /// Returns `Ok(None)` if no data was observed (empty fragment).
    fn finish(&mut self) -> Result<Option<Bytes>>;

    /// Schema metadata key used to record that a seed buffer was written.
    ///
    /// The provided implementation follows the documented convention,
    /// `"lance.seed.<column_name>"`, by joining [`SEED_META_KEY_PREFIX`] with
    /// [`column_name`](Self::column_name). Implementors normally should not
    /// need to override it; doing so risks producing a key that does not
    /// start with the shared prefix.
    fn schema_metadata_key(&self) -> String {
        format!("{SEED_META_KEY_PREFIX}{}", self.column_name())
    }

    /// Create a string to store in the file's schema metadata. This will
    /// normally contain the buffer index (provided by the caller after
    /// `add_global_buffer`) as well as any other information needed to
    /// validate or understand the seed (e.g. `rows_per_zone` for zone map
    /// seeds).
    fn schema_metadata_value(&self, buf_index: u32) -> String;
}

/// A pre-harvested seed buffer from a single fragment's data file.
#[derive(Debug, Clone)]
pub struct FragmentSeed {
/// ID of the fragment whose data file this seed was harvested from.
pub fragment_id: u64,
/// Contents of the seed buffer. These should only be consumed after
/// `metadata_value` has been checked for compatibility.
pub bytes: Bytes,
/// The raw value that was stored in the data file's schema metadata under
/// the seed key (i.e. the output of [`IndexSeedWriter::schema_metadata_value`]).
/// Plugins can inspect this to validate that the seed is compatible with the
/// current index configuration before consuming `bytes`.
pub metadata_value: String,
}
Loading
Loading