diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs
index 79a1f247670c..37cfdb9ba155 100644
--- a/parquet/src/encodings/encoding/dict_encoder.rs
+++ b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -64,7 +64,12 @@ impl<T: DataType> Storage for KeyStorage<T> {
     }
 
     fn estimated_memory_size(&self) -> usize {
-        self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>()
+        let uniques_heap_bytes = match T::get_physical_type() {
+            Type::FIXED_LEN_BYTE_ARRAY => self.type_length * self.uniques.len(),
+            _ => <T::T as ParquetValueType>::variable_length_bytes(&self.uniques)
+                .unwrap_or(0) as usize,
+        };
+        self.uniques.capacity() * std::mem::size_of::<T::T>() + uniques_heap_bytes
     }
 }
 
@@ -183,6 +188,214 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {
     ///
     /// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]).
     fn estimated_memory_size(&self) -> usize {
-        self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<i32>()
+        self.interner.estimated_memory_size()
+            + self.indices.capacity() * std::mem::size_of::<i32>()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::Arc;
+
+    use super::*;
+    use crate::data_type::{
+        ByteArray, ByteArrayType, FixedLenByteArray, FixedLenByteArrayType, Int32Type,
+    };
+    use crate::encodings::encoding::Encoder;
+    use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType};
+
+    fn make_col_desc<T: DataType>() -> ColumnDescPtr {
+        make_col_desc_with_length::<T>(-1)
+    }
+
+    fn make_col_desc_with_length<T: DataType>(type_length: i32) -> ColumnDescPtr {
+        let ty = SchemaType::primitive_type_builder("col", T::get_physical_type())
+            .with_length(type_length)
+            .build()
+            .unwrap();
+        Arc::new(ColumnDescriptor::new(
+            Arc::new(ty),
+            0,
+            0,
+            ColumnPath::new(vec![]),
+        ))
+    }
+
+    #[test]
+    fn test_estimated_memory_size_primitive_with_duplicates() {
+        let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
+        let empty_size = encoder.estimated_memory_size();
+
+        // 3 distinct values, repeated to produce 9 indices 
total.
+        encoder.put(&[1, 2, 3, 1, 2, 3, 1, 2, 3]).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 3 unique dictionary entries.
+        let dict_entry_size = 3 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 9 buffered indices.
+        let indices_size = 9 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_primitive_all_distinct() {
+        let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
+        let empty_size = encoder.estimated_memory_size();
+
+        let values: Vec<i32> = (0..100).collect();
+        encoder.put(&values).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 100 unique dictionary entries.
+        let dict_entry_size = 100 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 100 buffered indices.
+        let indices_size = 100 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_byte_array_with_duplicates() {
+        let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>());
+        let empty_size = encoder.estimated_memory_size();
+
+        // 3 distinct byte strings ("foo", "bar", "baz" — 3 bytes each), repeated to produce
+        // 9 indices total. 
+        let vals: Vec<ByteArray> = [
+            "foo", "bar", "baz", "foo", "bar", "baz", "foo", "bar", "baz",
+        ]
+        .iter()
+        .map(|s| ByteArray::from(*s))
+        .collect();
+        encoder.put(&vals).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 3 unique dictionary entries, including their heap-allocated bytes.
+        let dict_entry_size = 3 * std::mem::size_of::<ByteArray>() + 3 * 3; // 3 values × 3 bytes each
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 9 buffered indices.
+        let indices_size = 9 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_byte_array_all_distinct() {
+        let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>());
+        let empty_size = encoder.estimated_memory_size();
+
+        // 100 distinct values: "0".."9" (1 byte each) and "10".."99" (2 bytes each).
+        let values: Vec<ByteArray> = (0..100_u32)
+            .map(|i| ByteArray::from(i.to_string().into_bytes()))
+            .collect();
+        let bytes_total: usize = values.iter().map(|v| v.len()).sum(); // 10×1 + 90×2 = 190
+        encoder.put(&values).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 100 unique dictionary entries, including their heap-allocated bytes.
+        let dict_entry_size = 100 * std::mem::size_of::<ByteArray>() + bytes_total;
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 100 buffered indices. 
+        let indices_size = 100 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_fixed_len_byte_array_with_duplicates() {
+        const TYPE_LEN: usize = 3;
+        let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::<
+            FixedLenByteArrayType,
+        >(TYPE_LEN as i32));
+        let empty_size = encoder.estimated_memory_size();
+
+        // 3 distinct 3-byte values, repeated to produce 9 indices total.
+        let vals = [
+            b"foo", b"bar", b"baz", b"foo", b"bar", b"baz", b"foo", b"bar", b"baz",
+        ]
+        .iter()
+        .map(|b| FixedLenByteArray::from(b.to_vec()))
+        .collect::<Vec<_>>();
+        encoder.put(&vals).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 3 unique dictionary entries: struct overhead plus the
+        // fixed-length bytes allocated per entry.
+        let dict_entry_size = 3 * std::mem::size_of::<FixedLenByteArray>() + 3 * TYPE_LEN;
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 9 buffered indices.
+        let indices_size = 9 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+
+    #[test]
+    fn test_estimated_memory_size_fixed_len_byte_array_all_distinct() {
+        const TYPE_LEN: usize = 3;
+        let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::<
+            FixedLenByteArrayType,
+        >(TYPE_LEN as i32));
+        let empty_size = encoder.estimated_memory_size();
+
+        // 100 distinct 3-byte values: zero-padded big-endian u8 indices. 
+        let values = (0..100_u8)
+            .map(|i| FixedLenByteArray::from(vec![0u8, 0u8, i]))
+            .collect::<Vec<_>>();
+        encoder.put(&values).unwrap();
+
+        let size = encoder.estimated_memory_size();
+
+        // Must account for the 100 unique dictionary entries: struct overhead plus the
+        // fixed-length bytes allocated per entry.
+        let dict_entry_size = 100 * std::mem::size_of::<FixedLenByteArray>() + 100 * TYPE_LEN;
+        assert!(
+            size >= empty_size + dict_entry_size,
+            "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
+        );
+
+        // Must also account for the 100 buffered indices.
+        let indices_size = 100 * std::mem::size_of::<i32>();
+        assert!(
+            size >= empty_size + dict_entry_size + indices_size,
+            "memory size {size} should include indices ({indices_size} bytes)"
+        );
+    }
+}
diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
index 34c7d1390f7a..deae3720d5cf 100644
--- a/parquet/src/util/interner.rs
+++ b/parquet/src/util/interner.rs
@@ -77,9 +77,7 @@ impl<S: Storage> Interner<S> {
     /// Return estimate of the memory used, in bytes
     #[allow(dead_code)] // not used in parquet_derive, so is dead there
     pub fn estimated_memory_size(&self) -> usize {
-        self.storage.estimated_memory_size() +
-            // estimate size of dedup hashmap as just th size of the keys
-            self.dedup.capacity() + std::mem::size_of::<Key>()
+        self.storage.estimated_memory_size() + self.dedup.allocation_size()
     }
 
     /// Returns the storage for this interner