lance-format · westonpace · Jun 23, 2026 · Jun 24, 2026 · Jun 24, 2026 · Jun 24, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/protos/index_old.proto b/protos/index_old.proto
@@ -24,7 +24,19 @@ message BTreeIndexDetails {}
 message BitmapIndexDetails {}
 message LabelListIndexDetails {}
 message NGramIndexDetails {}
-message ZoneMapIndexDetails {}
+message ZoneMapIndexDetails {
+  // Number of rows per zone. Declared optional for backwards compatibility: the
+  // field is absent on datasets written before it was added. When it is absent,
+  // no seed writer is created for the index.
+  optional uint64 rows_per_zone = 1;
+  // Whether seed-based incremental updates are enabled for this index.
+  // Absent (old datasets) or false: seeds are disabled.
+  // true: seeds are enabled; the index embeds per-fragment seed buffers in data
+  // files and harvests them during incremental updates to skip full scans.
+  // When not set explicitly by the user, defaults to true for variable-length types (strings,
+  // binary) and fixed-width types wider than 8 bytes. NOTE(review): confirm where this is resolved.
+  optional bool use_seeds = 2;
+}
 message InvertedIndexDetails {
   // Marking this field as optional as old versions of the index store blank details and we
   // need to make sure we have a proper optional field to detect this.

diff --git a/rust/lance-index/Cargo.toml b/rust/lance-index/Cargo.toml
@@ -15,6 +15,7 @@ rust-version.workspace = true
 arc-swap.workspace = true
 arrow.workspace = true
 arrow-array.workspace = true
+arrow-ipc.workspace = true
 arrow-ord.workspace = true
 arrow-schema.workspace = true
 arrow-select.workspace = true

diff --git a/rust/lance-index/src/scalar.rs b/rust/lance-index/src/scalar.rs
@@ -48,6 +48,7 @@ pub mod ngram;
 pub mod registry;
 #[cfg(feature = "geo")]
 pub mod rtree;
+pub mod seed;
 pub mod zoned;
 pub mod zonemap;
 

diff --git a/rust/lance-index/src/scalar/registry.rs b/rust/lance-index/src/scalar/registry.rs
@@ -4,7 +4,7 @@
 use std::borrow::Cow;
 use std::sync::Arc;
 
-use arrow_schema::Field;
+use arrow_schema::{DataType, Field};
 use async_trait::async_trait;
 use datafusion::execution::SendableRecordBatchStream;
 use lance_core::{
@@ -217,6 +217,55 @@ pub trait ScalarIndexPlugin: Send + Sync + std::fmt::Debug {
         // Return an empty JSON object as the default implementation
         Ok(serde_json::json!({}))
     }
+
+    /// Optionally create a seed writer for the given column.
+    ///
+    /// A seed writer observes column values during data file writes, accumulates
+    /// compact statistics in memory, and serializes them as a global buffer
+    /// embedded in the data file footer. The buffer is later harvested during
+    /// index updates so that a full column scan can be skipped.
+    ///
+    /// Implementations must derive every construction parameter from `index_details` and must
+    /// not perform any I/O, even though the method is `async`. Return `Ok(None)` if this index
+    /// type does not support seed writing; the default implementation always returns `Ok(None)`.
+    async fn create_seed_writer(
+        &self,
+        _field_path: &str,
+        _data_type: &DataType,
+        _index_details: &prost_types::Any,
+    ) -> Result<Option<Box<dyn super::seed::IndexSeedWriter>>> {
+        Ok(None)
+    }
+
+    /// Reports whether data files may contain embedded seed buffers for this
+    /// index type under the configuration described by `_index_details`.
+    ///
+    /// When this returns `false` the caller can skip opening data files to look
+    /// for seeds entirely, avoiding I/O for index types or configurations that
+    /// never write seeds. The default implementation returns `false`.
+    fn might_use_seeds(&self, _index_details: &prost_types::Any) -> bool {
+        false
+    }
+
+    /// Attempt to update `reference_index` using pre-harvested `seeds` instead
+    /// of re-scanning column data.
+    ///
+    /// Each [`FragmentSeed`](super::seed::FragmentSeed) carries the raw bytes
+    /// written by the corresponding [`IndexSeedWriter`](super::seed::IndexSeedWriter)
+    /// and the original `metadata_value` stored in the data file, which the plugin
+    /// can use for compatibility validation (e.g. confirming `rows_per_zone`).
+    ///
+    /// Return `Ok(Some(created))` if the seed-based update succeeded, or `Ok(None)` to tell the
+    /// caller to fall back to a full column scan. The default implementation returns `Ok(None)`.
+    async fn update_from_seeds(
+        &self,
+        _seeds: Vec<super::seed::FragmentSeed>,
+        _reference_index: Arc<dyn ScalarIndex>,
+        _index_details: &prost_types::Any,
+        _dest_store: &dyn IndexStore,
+    ) -> Result<Option<CreatedIndex>> {
+        Ok(None)
+    }
 }
 
 /// In-memory cache key for a whole `Arc<dyn ScalarIndex>`.

diff --git a/rust/lance-index/src/scalar/seed.rs b/rust/lance-index/src/scalar/seed.rs
@@ -0,0 +1,59 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Index seed writers — compact per-fragment summaries embedded in data files.
+//!
+//! A seed writer observes column values as they are written to a data file,
+//! accumulates compact statistics in memory, and serializes them to a byte
+//! buffer that is embedded in the data file footer as a global buffer.
+//!
+//! The buffer can later be read back during index updates to reconstruct index
+//! statistics without re-scanning the column data.
+
+use arrow_array::ArrayRef;
+use bytes::Bytes;
+use lance_core::Result;
+
+/// Prefix shared by all seed schema metadata keys; a full key is `"lance.seed.<column_name>"`.
+pub const SEED_META_KEY_PREFIX: &str = "lance.seed.";
+
+/// A hook registered during data file writes that observes column values batch
+/// by batch, accumulates compact statistics in memory, and serializes them to
+/// a byte buffer that is embedded in the data file footer as a global buffer.
+///
+/// The buffer can later be read back during index updates to reconstruct index
+/// statistics without re-scanning the column data.
+pub trait IndexSeedWriter: Send + std::fmt::Debug {
+    /// Name of the column whose values this writer wants to observe.
+    fn column_name(&self) -> &str;
+
+    /// Observe one batch of column values as it is written to the current fragment.
+    /// Called once per batch; an `Err` reports that the batch could not be processed.
+    fn observe_batch(&mut self, values: &ArrayRef) -> Result<()>;
+
+    /// Serialize the accumulated state to bytes and reset the writer for the next fragment.
+    /// Returns `Ok(None)` if no data was observed (empty fragment).
+    fn finish(&mut self) -> Result<Option<Bytes>>;
+
+    /// Schema metadata key used to record that a seed buffer was written.
+    /// Convention: [`SEED_META_KEY_PREFIX`] plus the column name, i.e. `"lance.seed.<column_name>"`.
+    fn schema_metadata_key(&self) -> String;
+
+    /// Create the string stored in the file's schema metadata under the seed key. It will
+    /// normally contain the buffer index (provided by the caller after `add_global_buffer`)
+    /// plus any other information needed to validate or understand the seed (e.g.
+    /// `rows_per_zone` for zone map seeds); it is surfaced as [`FragmentSeed::metadata_value`].
+    fn schema_metadata_value(&self, buf_index: u32) -> String;
+}
+
+/// A pre-harvested seed buffer from a single fragment's data file.
+#[derive(Debug, Clone)]
+pub struct FragmentSeed {
+    pub fragment_id: u64, // id of the fragment whose data file carried this seed
+    pub bytes: Bytes, // raw seed bytes as written by the corresponding `IndexSeedWriter`
+    /// The raw value that was stored in the data file's schema metadata under
+    /// the seed key (i.e. the output of [`IndexSeedWriter::schema_metadata_value`]).
+    /// Plugins can inspect this to validate that the seed is compatible with the
+    /// current index configuration before consuming `bytes`.
+    pub metadata_value: String,
+}