-
Notifications
You must be signed in to change notification settings - Fork 1.2k
fix: correct accounting in DictEncoder::estimated_memory_size, Interner::estimated_memory_size
#9720
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: correct accounting in DictEncoder::estimated_memory_size, Interner::estimated_memory_size
#9720
Changes from 7 commits
19002ab
70b0912
ffa40d9
07aba13
bf6a03a
52795a8
986b084
e454a9c
3dd032a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -64,7 +64,12 @@ impl<T: DataType> Storage for KeyStorage<T> { | |
| } | ||
|
|
||
| fn estimated_memory_size(&self) -> usize { | ||
| self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>() | ||
| let uniques_heap_bytes = match T::get_physical_type() { | ||
| Type::FIXED_LEN_BYTE_ARRAY => self.type_length * self.uniques.len(), | ||
| _ => <Self::Value as ParquetValueType>::variable_length_bytes(&self.uniques) | ||
| .unwrap_or(0) as usize, | ||
| }; | ||
| self.uniques.capacity() * std::mem::size_of::<T::T>() + uniques_heap_bytes | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -183,6 +188,214 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> { | |
| /// | ||
| /// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]). | ||
| fn estimated_memory_size(&self) -> usize { | ||
| self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<usize>() | ||
| self.interner.estimated_memory_size() | ||
| + self.indices.capacity() * std::mem::size_of::<usize>() | ||
| } | ||
| } | ||
|
|
||
| #[cfg(test)] | ||
| mod tests { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thank you @mzabaluev — I ran these tests without your code change and they all passed. Thus I don't think they are covering whatever issue you have found. andrewlamb@Andrews-MacBook-Pro-3:~/Software/arrow-rs$ git diff
diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs
index 37cfdb9ba1..2f32f9c0bb 100644
--- a/parquet/src/encodings/encoding/dict_encoder.rs
+++ b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -64,12 +64,7 @@ impl<T: DataType> Storage for KeyStorage<T> {
}
fn estimated_memory_size(&self) -> usize {
- let uniques_heap_bytes = match T::get_physical_type() {
- Type::FIXED_LEN_BYTE_ARRAY => self.type_length * self.uniques.len(),
- _ => <Self::Value as ParquetValueType>::variable_length_bytes(&self.uniques)
- .unwrap_or(0) as usize,
- };
- self.uniques.capacity() * std::mem::size_of::<T::T>() + uniques_heap_bytes
+ self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>()
}
}
diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
index deae3720d5..34c7d1390f 100644
--- a/parquet/src/util/interner.rs
+++ b/parquet/src/util/interner.rs
@@ -77,7 +77,9 @@ impl<S: Storage> Interner<S> {
/// Return estimate of the memory used, in bytes
#[allow(dead_code)] // not used in parquet_derive, so is dead there
pub fn estimated_memory_size(&self) -> usize {
- self.storage.estimated_memory_size() + self.dedup.allocation_size()
+ self.storage.estimated_memory_size() +
+ // estimate size of dedup hashmap as just the size of the keys
+ self.dedup.capacity() + std::mem::size_of::<S::Key>()
}
/// Returns the storage for this interner

And then
Finished `test` profile [unoptimized + debuginfo] target(s) in 0.10s
Running unittests src/lib.rs (target/debug/deps/parquet-d5e640393e7492a1)
running 6 tests
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_primitive_with_duplicates ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_primitive_all_distinct ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_fixed_len_byte_array_with_duplicates ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_byte_array_with_duplicates ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_fixed_len_byte_array_all_distinct ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_byte_array_all_distinct ... ok
test result: ok. 6 passed; 0 failed; 0 ignored; 0 measured; 1051 filtered out; finished in 0.00s
andrewlamb@Andrews-MacBook-Pro-3:~/Software/arrow-rs$
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was another change. This makes: @@ -183,6 +188,214 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {
///
/// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]).
fn estimated_memory_size(&self) -> usize {
- self.interner.estimated_memory_size()
- + self.indices.capacity() * std::mem::size_of::<usize>()
+ self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<usize>()
}
}

I have added more tests that fail with the baseline code. |
||
| use std::sync::Arc; | ||
|
|
||
| use super::*; | ||
| use crate::data_type::{ | ||
| ByteArray, ByteArrayType, FixedLenByteArray, FixedLenByteArrayType, Int32Type, | ||
| }; | ||
| use crate::encodings::encoding::Encoder; | ||
| use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType}; | ||
|
|
||
| fn make_col_desc<T: DataType>() -> ColumnDescPtr { | ||
| make_col_desc_with_length::<T>(-1) | ||
| } | ||
|
|
||
| fn make_col_desc_with_length<T: DataType>(type_length: i32) -> ColumnDescPtr { | ||
| let ty = SchemaType::primitive_type_builder("col", T::get_physical_type()) | ||
| .with_length(type_length) | ||
| .build() | ||
| .unwrap(); | ||
| Arc::new(ColumnDescriptor::new( | ||
| Arc::new(ty), | ||
| 0, | ||
| 0, | ||
| ColumnPath::new(vec![]), | ||
| )) | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_estimated_memory_size_primitive_with_duplicates() { | ||
| let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>()); | ||
| let empty_size = encoder.estimated_memory_size(); | ||
|
|
||
| // 3 distinct values, repeated to produce 9 indices total. | ||
| encoder.put(&[1, 2, 3, 1, 2, 3, 1, 2, 3]).unwrap(); | ||
|
|
||
| let size = encoder.estimated_memory_size(); | ||
|
|
||
| // Must account for the 3 unique dictionary entries. | ||
| let dict_entry_size = 3 * std::mem::size_of::<i32>(); | ||
| assert!( | ||
| size >= empty_size + dict_entry_size, | ||
| "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)" | ||
| ); | ||
|
|
||
| // Must also account for the 9 buffered indices. | ||
| let indices_size = 9 * std::mem::size_of::<usize>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. similarly this should probably be |
||
| assert!( | ||
| size >= empty_size + dict_entry_size + indices_size, | ||
| "memory size {size} should include indices ({indices_size} bytes)" | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_estimated_memory_size_primitive_all_distinct() { | ||
| let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>()); | ||
| let empty_size = encoder.estimated_memory_size(); | ||
|
|
||
| let values: Vec<i32> = (0..100).collect(); | ||
| encoder.put(&values).unwrap(); | ||
|
|
||
| let size = encoder.estimated_memory_size(); | ||
|
|
||
| // Must account for the 100 unique dictionary entries. | ||
| let dict_entry_size = 100 * std::mem::size_of::<i32>(); | ||
| assert!( | ||
| size >= empty_size + dict_entry_size, | ||
| "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)" | ||
| ); | ||
|
|
||
| // Must also account for the 100 buffered indices. | ||
| let indices_size = 100 * std::mem::size_of::<usize>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. here too |
||
| assert!( | ||
| size >= empty_size + dict_entry_size + indices_size, | ||
| "memory size {size} should include indices ({indices_size} bytes)" | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_estimated_memory_size_byte_array_with_duplicates() { | ||
| let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>()); | ||
| let empty_size = encoder.estimated_memory_size(); | ||
|
|
||
| // 3 distinct byte strings ("foo", "bar", "baz" — 3 bytes each), repeated to produce | ||
| // 9 indices total. | ||
| let vals: Vec<ByteArray> = [ | ||
| "foo", "bar", "baz", "foo", "bar", "baz", "foo", "bar", "baz", | ||
| ] | ||
| .iter() | ||
| .map(|s| ByteArray::from(*s)) | ||
| .collect(); | ||
| encoder.put(&vals).unwrap(); | ||
|
|
||
| let size = encoder.estimated_memory_size(); | ||
|
|
||
| // Must account for the 3 unique dictionary entries, including their heap-allocated bytes. | ||
| let dict_entry_size = 3 * std::mem::size_of::<ByteArray>() + 3 * 3; // 3 values × 3 bytes each | ||
| assert!( | ||
| size >= empty_size + dict_entry_size, | ||
| "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)" | ||
| ); | ||
|
|
||
| // Must also account for the 9 buffered indices. | ||
| let indices_size = 9 * std::mem::size_of::<usize>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and here |
||
| assert!( | ||
| size >= empty_size + dict_entry_size + indices_size, | ||
| "memory size {size} should include indices ({indices_size} bytes)" | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_estimated_memory_size_byte_array_all_distinct() { | ||
| let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>()); | ||
| let empty_size = encoder.estimated_memory_size(); | ||
|
|
||
| // 100 distinct values: "0".."9" (1 byte each) and "10".."99" (2 bytes each). | ||
| let values: Vec<ByteArray> = (0..100_u32) | ||
| .map(|i| ByteArray::from(i.to_string().into_bytes())) | ||
| .collect(); | ||
| let bytes_total: usize = values.iter().map(|v| v.len()).sum(); // 10×1 + 90×2 = 190 | ||
| encoder.put(&values).unwrap(); | ||
|
|
||
| let size = encoder.estimated_memory_size(); | ||
|
|
||
| // Must account for the 100 unique dictionary entries, including their heap-allocated bytes. | ||
| let dict_entry_size = 100 * std::mem::size_of::<ByteArray>() + bytes_total; | ||
| assert!( | ||
| size >= empty_size + dict_entry_size, | ||
| "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)" | ||
| ); | ||
|
|
||
| // Must also account for the 100 buffered indices. | ||
| let indices_size = 100 * std::mem::size_of::<usize>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and here |
||
| assert!( | ||
| size >= empty_size + dict_entry_size + indices_size, | ||
| "memory size {size} should include indices ({indices_size} bytes)" | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_estimated_memory_size_fixed_len_byte_array_with_duplicates() { | ||
| const TYPE_LEN: usize = 3; | ||
| let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::< | ||
| FixedLenByteArrayType, | ||
| >(TYPE_LEN as i32)); | ||
| let empty_size = encoder.estimated_memory_size(); | ||
|
|
||
| // 3 distinct 3-byte values, repeated to produce 9 indices total. | ||
| let vals = [ | ||
| b"foo", b"bar", b"baz", b"foo", b"bar", b"baz", b"foo", b"bar", b"baz", | ||
| ] | ||
| .iter() | ||
| .map(|b| FixedLenByteArray::from(b.to_vec())) | ||
| .collect::<Vec<_>>(); | ||
| encoder.put(&vals).unwrap(); | ||
|
|
||
| let size = encoder.estimated_memory_size(); | ||
|
|
||
| // Must account for the 3 unique dictionary entries: struct overhead plus the | ||
| // fixed-length bytes allocated per entry. | ||
| let dict_entry_size = 3 * std::mem::size_of::<FixedLenByteArray>() + 3 * TYPE_LEN; | ||
| assert!( | ||
| size >= empty_size + dict_entry_size, | ||
| "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)" | ||
| ); | ||
|
|
||
| // Must also account for the 9 buffered indices. | ||
| let indices_size = 9 * std::mem::size_of::<usize>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and here |
||
| assert!( | ||
| size >= empty_size + dict_entry_size + indices_size, | ||
| "memory size {size} should include indices ({indices_size} bytes)" | ||
| ); | ||
| } | ||
|
|
||
| #[test] | ||
| fn test_estimated_memory_size_fixed_len_byte_array_all_distinct() { | ||
| const TYPE_LEN: usize = 3; | ||
| let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::< | ||
| FixedLenByteArrayType, | ||
| >(TYPE_LEN as i32)); | ||
| let empty_size = encoder.estimated_memory_size(); | ||
|
|
||
| // 100 distinct 3-byte values: zero-padded big-endian u8 indices. | ||
| let values = (0..100_u8) | ||
| .map(|i| FixedLenByteArray::from(vec![0u8, 0u8, i])) | ||
| .collect::<Vec<_>>(); | ||
| encoder.put(&values).unwrap(); | ||
|
|
||
| let size = encoder.estimated_memory_size(); | ||
|
|
||
| // Must account for the 100 unique dictionary entries: struct overhead plus the | ||
| // fixed-length bytes allocated per entry. | ||
| let dict_entry_size = 100 * std::mem::size_of::<FixedLenByteArray>() + 100 * TYPE_LEN; | ||
| assert!( | ||
| size >= empty_size + dict_entry_size, | ||
| "memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)" | ||
| ); | ||
|
|
||
| // Must also account for the 100 buffered indices. | ||
| let indices_size = 100 * std::mem::size_of::<usize>(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. and here |
||
| assert!( | ||
| size >= empty_size + dict_entry_size + indices_size, | ||
| "memory size {size} should include indices ({indices_size} bytes)" | ||
| ); | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indices is a `Vec` whose element type is not `usize` (the generic parameter was stripped by the page rendering),
but this is computing the size using
`usize`. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Indeed, I did not notice that this was also wrong.