Skip to content

Commit 73ceb1d

Browse files
authored
Expose ColumnCloseResult on ArrowColumnChunk (#9773)
Adds `close()` and `close_mut()` accessors on `ArrowColumnChunk` so callers can inspect and mutate the `ColumnCloseResult` produced by `ArrowColumnWriter::close()` before appending the chunk to a row group. My motivation is dynamically deciding whether to omit the page index. This should not be controversial given that `ColumnCloseResult` is already public. # Which issue does this PR close? - Closes #9774. # What changes are included in this PR? Adds accessors to `ArrowColumnChunk`. # Are these changes tested? A unit test is included, which also illustrates a potential use case. # Are there any user-facing changes? Yes, the accessors are public.
1 parent 9a2b49c commit 73ceb1d

1 file changed

Lines changed: 70 additions & 0 deletions

File tree

  • parquet/src/arrow/arrow_writer

parquet/src/arrow/arrow_writer/mod.rs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -765,6 +765,26 @@ impl std::fmt::Debug for ArrowColumnChunk {
765765
}
766766

767767
impl ArrowColumnChunk {
768+
/// Returns the [`ColumnCloseResult`] produced when the chunk was closed.
769+
///
770+
/// Exposes encoding information, collected statistics, and the optional
771+
/// [`ColumnIndexMetaData`](crate::file::page_index::column_index::ColumnIndexMetaData)
772+
/// / [`OffsetIndexMetaData`](crate::file::page_index::offset_index::OffsetIndexMetaData)
773+
/// gathered for the column chunk.
774+
pub fn close(&self) -> &ColumnCloseResult {
775+
&self.close
776+
}
777+
778+
/// Returns a mutable reference to the [`ColumnCloseResult`].
779+
///
780+
/// This allows callers to mutate the close result before the chunk is
781+
/// appended to a row group — for example, clearing `column_index` or
782+
/// `bloom_filter` based on a dynamic rule that inspects the encodings and
783+
/// collected page statistics.
784+
pub fn close_mut(&mut self) -> &mut ColumnCloseResult {
785+
&mut self.close
786+
}
787+
768788
/// Calls [`SerializedRowGroupWriter::append_column`] with this column's data
769789
pub fn append_to_row_group<W: Write + Send>(
770790
self,
@@ -5066,4 +5086,54 @@ mod tests {
50665086
let total_rows: i64 = sizes.iter().sum();
50675087
assert_eq!(total_rows, 100, "Total rows should be preserved");
50685088
}
5089+
5090+
#[test]
5091+
fn arrow_column_chunk_close_mut_drops_column_index() {
5092+
use crate::arrow::ArrowSchemaConverter;
5093+
use crate::file::writer::SerializedFileWriter;
5094+
5095+
let schema = Arc::new(Schema::new(vec![Field::new("i", DataType::Int32, false)]));
5096+
let props = Arc::new(
5097+
WriterProperties::builder()
5098+
.set_statistics_enabled(EnabledStatistics::Page)
5099+
.build(),
5100+
);
5101+
let parquet_schema = ArrowSchemaConverter::new()
5102+
.with_coerce_types(props.coerce_types())
5103+
.convert(&schema)
5104+
.unwrap();
5105+
5106+
let mut buf = Vec::with_capacity(1024);
5107+
let mut writer =
5108+
SerializedFileWriter::new(&mut buf, parquet_schema.root_schema_ptr(), props.clone())
5109+
.unwrap();
5110+
5111+
let factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema));
5112+
let mut col_writers = factory.create_column_writers(0).unwrap();
5113+
let arr: ArrayRef = Arc::new(Int32Array::from_iter_values(0..64));
5114+
for leaves in compute_leaves(schema.field(0), &arr).unwrap() {
5115+
col_writers[0].write(&leaves).unwrap();
5116+
}
5117+
let mut chunk = col_writers.pop().unwrap().close().unwrap();
5118+
5119+
// Immutable accessor exposes the close result produced at close time.
5120+
assert!(
5121+
chunk.close().column_index.is_some(),
5122+
"EnabledStatistics::Page should produce a column_index"
5123+
);
5124+
5125+
// Mutable accessor lets callers drop the page-level index before append.
5126+
chunk.close_mut().column_index = None;
5127+
assert!(chunk.close().column_index.is_none());
5128+
5129+
let mut rg = writer.next_row_group().unwrap();
5130+
chunk.append_to_row_group(&mut rg).unwrap();
5131+
rg.close().unwrap();
5132+
let file_meta = writer.close().unwrap();
5133+
5134+
// After dropping column_index, the resulting file records no column
5135+
// index offset/length for this chunk.
5136+
let cc = file_meta.row_group(0).column(0);
5137+
assert!(cc.column_index_range().is_none());
5138+
}
50695139
}

0 commit comments

Comments
 (0)