-
Notifications
You must be signed in to change notification settings - Fork 737
feat(index): write zone map seeds into data file footers during append #7427
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
43b6afb
a54ecc7
7b032cb
790ea89
1c09210
46311f2
eb576fb
a1f6a99
7c60154
338199a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,7 +24,19 @@ message BTreeIndexDetails {} | |
| message BitmapIndexDetails {} | ||
| message LabelListIndexDetails {} | ||
| message NGramIndexDetails {} | ||
| message ZoneMapIndexDetails {} | ||
| message ZoneMapIndexDetails { | ||
| // Number of rows per zone. Optional for backwards compatibility: absent on | ||
| // datasets written before this field was added. When absent, no seed writer | ||
| // is created for the index. | ||
| optional uint64 rows_per_zone = 1; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We will need a vote for this change. |
||
| // Whether seed-based incremental updates are enabled for this index. | ||
| // Absent (old datasets) or false: seeds disabled. | ||
| // true: seeds enabled; the index will embed per-fragment seed buffers in | ||
| // data files and harvest them during incremental updates to skip full scans. | ||
| // Defaults to true for variable-length column types (strings, binary) and | ||
| // fixed-width types wider than 8 bytes when not explicitly set by the user. | ||
| optional bool use_seeds = 2; | ||
| } | ||
| message InvertedIndexDetails { | ||
| // Marking this field as optional as old versions of the index store blank details and we | ||
| // need to make sure we have a proper optional field to detect this. | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,59 @@ | ||
| // SPDX-License-Identifier: Apache-2.0 | ||
| // SPDX-FileCopyrightText: Copyright The Lance Authors | ||
|
|
||
| //! Index seed writers — compact per-fragment summaries embedded in data files. | ||
| //! | ||
| //! A seed writer observes column values as they are written to a data file, | ||
| //! accumulates compact statistics in memory, and serializes them to a byte | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At the moment we assume the entire seed can comfortably fit in memory, but in theory we could split it into multiple global buffers in the future if we needed to. |
||
| //! buffer that is embedded in the data file footer as a global buffer. | ||
| //! | ||
| //! The buffer can later be read back during index updates to reconstruct index | ||
| //! statistics without re-scanning the column data. | ||
|
|
||
| use arrow_array::ArrayRef; | ||
| use bytes::Bytes; | ||
| use lance_core::Result; | ||
|
|
||
| /// Schema metadata key prefix shared by all seed buffers; full keys follow the convention `"lance.seed.<column_name>"`. | ||
| pub const SEED_META_KEY_PREFIX: &str = "lance.seed."; | ||
|
|
||
| /// A hook registered during data file writes that observes column values batch | ||
| /// by batch, accumulates compact statistics in memory, and serializes them to | ||
| /// a byte buffer that is embedded in the data file footer as a global buffer. | ||
| /// | ||
| /// The buffer can later be read back during index updates to reconstruct index | ||
| /// statistics without re-scanning the column data. | ||
| pub trait IndexSeedWriter: Send + std::fmt::Debug { | ||
| /// Name of the column whose values this writer observes. | ||
| fn column_name(&self) -> &str; | ||
|
|
||
| /// Observe one batch of column values as it is written to the current fragment. | ||
| /// Called once per batch; implementations fold the values into their in-memory state. | ||
| fn observe_batch(&mut self, values: &ArrayRef) -> Result<()>; | ||
|
|
||
| /// Serialize the accumulated state to bytes and reset the writer for the next fragment. | ||
| /// Returns `Ok(None)` if no data was observed (empty fragment). | ||
| fn finish(&mut self) -> Result<Option<Bytes>>; | ||
|
|
||
| /// Schema metadata key used to record that a seed buffer was written. | ||
| /// Convention: `"lance.seed.<column_name>"` (see [`SEED_META_KEY_PREFIX`]). | ||
| fn schema_metadata_key(&self) -> String; | ||
|
|
||
| /// Build the string stored in the file's schema metadata under | ||
| /// [`Self::schema_metadata_key`]. It will normally contain `buf_index` (the buffer | ||
| /// index provided by the caller after `add_global_buffer`) as well as any other | ||
| /// information needed to validate or understand the seed (e.g. `rows_per_zone` for zone map seeds). | ||
| fn schema_metadata_value(&self, buf_index: u32) -> String; | ||
| } | ||
|
|
||
| /// A pre-harvested seed buffer from a single fragment's data file. | ||
| #[derive(Debug, Clone)] | ||
| pub struct FragmentSeed { | ||
| /// ID of the fragment whose data file this seed was harvested from. | ||
| pub fragment_id: u64, | ||
| /// Raw seed buffer contents (presumably the bytes produced by | ||
| /// [`IndexSeedWriter::finish`] — TODO confirm against the harvesting code). | ||
| pub bytes: Bytes, | ||
| /// The raw value that was stored in the data file's schema metadata under | ||
| /// the seed key (i.e. the output of [`IndexSeedWriter::schema_metadata_value`]). | ||
| /// Plugins can inspect this to validate that the seed is compatible with the | ||
| /// current index configuration before consuming `bytes`. | ||
| pub metadata_value: String, | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we are going to touch this file, I'm thinking we need a better name for it.