Skip to content
217 changes: 215 additions & 2 deletions parquet/src/encodings/encoding/dict_encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,12 @@ impl<T: DataType> Storage for KeyStorage<T> {
}

fn estimated_memory_size(&self) -> usize {
self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>()
let uniques_heap_bytes = match T::get_physical_type() {
Type::FIXED_LEN_BYTE_ARRAY => self.type_length * self.uniques.len(),
_ => <Self::Value as ParquetValueType>::variable_length_bytes(&self.uniques)
.unwrap_or(0) as usize,
};
self.uniques.capacity() * std::mem::size_of::<T::T>() + uniques_heap_bytes
}
}

Expand Down Expand Up @@ -183,6 +188,214 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {
///
/// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]).
fn estimated_memory_size(&self) -> usize {
self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<usize>()
self.interner.estimated_memory_size()
+ self.indices.capacity() * std::mem::size_of::<usize>()
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indices is `Vec<u64>`:

    /// The buffered indices
    indices: Vec<u64>,

But this is using usize

Suggested change
+ self.indices.capacity() * std::mem::size_of::<usize>()
+ self.indices.capacity() * std::mem::size_of::<u64>()

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, I did not notice that this was also wrong.

}
}

#[cfg(test)]
mod tests {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you @mzabaluev

I ran these tests without your code change and they all passed. Thus I don't think they are covering whatever issue you have found

andrewlamb@Andrews-MacBook-Pro-3:~/Software/arrow-rs$ git diff
diff --git a/parquet/src/encodings/encoding/dict_encoder.rs b/parquet/src/encodings/encoding/dict_encoder.rs
index 37cfdb9ba1..2f32f9c0bb 100644
--- a/parquet/src/encodings/encoding/dict_encoder.rs
+++ b/parquet/src/encodings/encoding/dict_encoder.rs
@@ -64,12 +64,7 @@ impl<T: DataType> Storage for KeyStorage<T> {
     }

     fn estimated_memory_size(&self) -> usize {
-        let uniques_heap_bytes = match T::get_physical_type() {
-            Type::FIXED_LEN_BYTE_ARRAY => self.type_length * self.uniques.len(),
-            _ => <Self::Value as ParquetValueType>::variable_length_bytes(&self.uniques)
-                .unwrap_or(0) as usize,
-        };
-        self.uniques.capacity() * std::mem::size_of::<T::T>() + uniques_heap_bytes
+        self.size_in_bytes + self.uniques.capacity() * std::mem::size_of::<T::T>()
     }
 }

diff --git a/parquet/src/util/interner.rs b/parquet/src/util/interner.rs
index deae3720d5..34c7d1390f 100644
--- a/parquet/src/util/interner.rs
+++ b/parquet/src/util/interner.rs
@@ -77,7 +77,9 @@ impl<S: Storage> Interner<S> {
     /// Return estimate of the memory used, in bytes
     #[allow(dead_code)] // not used in parquet_derive, so is dead there
     pub fn estimated_memory_size(&self) -> usize {
-        self.storage.estimated_memory_size() + self.dedup.allocation_size()
+        self.storage.estimated_memory_size() +
+            // estimate size of dedup hashmap as just the size of the keys
+            self.dedup.capacity() + std::mem::size_of::<S::Key>()
     }

     /// Returns the storage for this interner

And then

andrewlamb@Andrews-MacBook-Pro-3:~/Software/arrow-rs$ cargo test --lib -p parquet -- dict_encoder
    Finished `test` profile [unoptimized + debuginfo] target(s) in 0.10s
     Running unittests src/lib.rs (target/debug/deps/parquet-d5e640393e7492a1)

running 6 tests
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_primitive_with_duplicates ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_primitive_all_distinct ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_fixed_len_byte_array_with_duplicates ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_byte_array_with_duplicates ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_fixed_len_byte_array_all_distinct ... ok
test encodings::encoding::dict_encoder::tests::test_estimated_memory_size_byte_array_all_distinct ... ok

test result: ok. 6 passed; 0 failed; 0 ignored; 0 measured; 1051 filtered out; finished in 0.00s

andrewlamb@Andrews-MacBook-Pro-3:~/Software/arrow-rs$

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There was another change. This makes ByteArray and FixedLenByteArray tests fail:

@@ -188,8 +183,7 @@ impl<T: DataType> Encoder<T> for DictEncoder<T> {
     ///
     /// For this encoder, the indices are unencoded bytes (refer to [`Self::write_indices`]).
     fn estimated_memory_size(&self) -> usize {
-        self.interner.estimated_memory_size()
-            + self.indices.capacity() * std::mem::size_of::<usize>()
+        self.interner.storage().size_in_bytes + self.indices.len() * std::mem::size_of::<usize>()
     }
 }

I have added more tests that fail with the baseline code.

use std::sync::Arc;

use super::*;
use crate::data_type::{
ByteArray, ByteArrayType, FixedLenByteArray, FixedLenByteArrayType, Int32Type,
};
use crate::encodings::encoding::Encoder;
use crate::schema::types::{ColumnDescriptor, ColumnPath, Type as SchemaType};

/// Convenience wrapper around [`make_col_desc_with_length`] for physical types
/// that carry no fixed type length (passes -1, i.e. "no length").
fn make_col_desc<T: DataType>() -> ColumnDescPtr {
    make_col_desc_with_length::<T>(-1)
}

/// Builds a [`ColumnDescPtr`] describing a single primitive column named
/// `"col"` with the physical type of `T` and the given fixed `type_length`
/// (relevant for `FIXED_LEN_BYTE_ARRAY`; pass -1 for types without one).
fn make_col_desc_with_length<T: DataType>(type_length: i32) -> ColumnDescPtr {
    // Build the primitive schema node; `unwrap` is fine in test helpers.
    let primitive = SchemaType::primitive_type_builder("col", T::get_physical_type())
        .with_length(type_length)
        .build()
        .unwrap();
    // Zero definition/repetition levels and an empty column path suffice here.
    let descriptor =
        ColumnDescriptor::new(Arc::new(primitive), 0, 0, ColumnPath::new(Vec::new()));
    Arc::new(descriptor)
}

#[test]
fn test_estimated_memory_size_primitive_with_duplicates() {
let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
let empty_size = encoder.estimated_memory_size();

// 3 distinct values, repeated to produce 9 indices total.
encoder.put(&[1, 2, 3, 1, 2, 3, 1, 2, 3]).unwrap();

let size = encoder.estimated_memory_size();

// Must account for the 3 unique dictionary entries.
let dict_entry_size = 3 * std::mem::size_of::<i32>();
assert!(
size >= empty_size + dict_entry_size,
"memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
);

// Must also account for the 9 buffered indices.
let indices_size = 9 * std::mem::size_of::<usize>();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

similarly this should probably be size_of<u64>

assert!(
size >= empty_size + dict_entry_size + indices_size,
"memory size {size} should include indices ({indices_size} bytes)"
);
}

#[test]
fn test_estimated_memory_size_primitive_all_distinct() {
let mut encoder = DictEncoder::<Int32Type>::new(make_col_desc::<Int32Type>());
let empty_size = encoder.estimated_memory_size();

let values: Vec<i32> = (0..100).collect();
encoder.put(&values).unwrap();

let size = encoder.estimated_memory_size();

// Must account for the 100 unique dictionary entries.
let dict_entry_size = 100 * std::mem::size_of::<i32>();
assert!(
size >= empty_size + dict_entry_size,
"memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
);

// Must also account for the 100 buffered indices.
let indices_size = 100 * std::mem::size_of::<usize>();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

here too

assert!(
size >= empty_size + dict_entry_size + indices_size,
"memory size {size} should include indices ({indices_size} bytes)"
);
}

#[test]
fn test_estimated_memory_size_byte_array_with_duplicates() {
let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>());
let empty_size = encoder.estimated_memory_size();

// 3 distinct byte strings ("foo", "bar", "baz" — 3 bytes each), repeated to produce
// 9 indices total.
let vals: Vec<ByteArray> = [
"foo", "bar", "baz", "foo", "bar", "baz", "foo", "bar", "baz",
]
.iter()
.map(|s| ByteArray::from(*s))
.collect();
encoder.put(&vals).unwrap();

let size = encoder.estimated_memory_size();

// Must account for the 3 unique dictionary entries, including their heap-allocated bytes.
let dict_entry_size = 3 * std::mem::size_of::<ByteArray>() + 3 * 3; // 3 values × 3 bytes each
assert!(
size >= empty_size + dict_entry_size,
"memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
);

// Must also account for the 9 buffered indices.
let indices_size = 9 * std::mem::size_of::<usize>();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and here

assert!(
size >= empty_size + dict_entry_size + indices_size,
"memory size {size} should include indices ({indices_size} bytes)"
);
}

#[test]
fn test_estimated_memory_size_byte_array_all_distinct() {
let mut encoder = DictEncoder::<ByteArrayType>::new(make_col_desc::<ByteArrayType>());
let empty_size = encoder.estimated_memory_size();

// 100 distinct values: "0".."9" (1 byte each) and "10".."99" (2 bytes each).
let values: Vec<ByteArray> = (0..100_u32)
.map(|i| ByteArray::from(i.to_string().into_bytes()))
.collect();
let bytes_total: usize = values.iter().map(|v| v.len()).sum(); // 10×1 + 90×2 = 190
encoder.put(&values).unwrap();

let size = encoder.estimated_memory_size();

// Must account for the 100 unique dictionary entries, including their heap-allocated bytes.
let dict_entry_size = 100 * std::mem::size_of::<ByteArray>() + bytes_total;
assert!(
size >= empty_size + dict_entry_size,
"memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
);

// Must also account for the 100 buffered indices.
let indices_size = 100 * std::mem::size_of::<usize>();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and here

assert!(
size >= empty_size + dict_entry_size + indices_size,
"memory size {size} should include indices ({indices_size} bytes)"
);
}

#[test]
fn test_estimated_memory_size_fixed_len_byte_array_with_duplicates() {
const TYPE_LEN: usize = 3;
let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::<
FixedLenByteArrayType,
>(TYPE_LEN as i32));
let empty_size = encoder.estimated_memory_size();

// 3 distinct 3-byte values, repeated to produce 9 indices total.
let vals = [
b"foo", b"bar", b"baz", b"foo", b"bar", b"baz", b"foo", b"bar", b"baz",
]
.iter()
.map(|b| FixedLenByteArray::from(b.to_vec()))
.collect::<Vec<_>>();
encoder.put(&vals).unwrap();

let size = encoder.estimated_memory_size();

// Must account for the 3 unique dictionary entries: struct overhead plus the
// fixed-length bytes allocated per entry.
let dict_entry_size = 3 * std::mem::size_of::<FixedLenByteArray>() + 3 * TYPE_LEN;
assert!(
size >= empty_size + dict_entry_size,
"memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
);

// Must also account for the 9 buffered indices.
let indices_size = 9 * std::mem::size_of::<usize>();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and here

assert!(
size >= empty_size + dict_entry_size + indices_size,
"memory size {size} should include indices ({indices_size} bytes)"
);
}

#[test]
fn test_estimated_memory_size_fixed_len_byte_array_all_distinct() {
const TYPE_LEN: usize = 3;
let mut encoder = DictEncoder::<FixedLenByteArrayType>::new(make_col_desc_with_length::<
FixedLenByteArrayType,
>(TYPE_LEN as i32));
let empty_size = encoder.estimated_memory_size();

// 100 distinct 3-byte values: zero-padded big-endian u8 indices.
let values = (0..100_u8)
.map(|i| FixedLenByteArray::from(vec![0u8, 0u8, i]))
.collect::<Vec<_>>();
encoder.put(&values).unwrap();

let size = encoder.estimated_memory_size();

// Must account for the 100 unique dictionary entries: struct overhead plus the
// fixed-length bytes allocated per entry.
let dict_entry_size = 100 * std::mem::size_of::<FixedLenByteArray>() + 100 * TYPE_LEN;
assert!(
size >= empty_size + dict_entry_size,
"memory size {size} should grow by at least the dict storage ({dict_entry_size} bytes)"
);

// Must also account for the 100 buffered indices.
let indices_size = 100 * std::mem::size_of::<usize>();
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

and here

assert!(
size >= empty_size + dict_entry_size + indices_size,
"memory size {size} should include indices ({indices_size} bytes)"
);
}
}
4 changes: 1 addition & 3 deletions parquet/src/util/interner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,7 @@ impl<S: Storage> Interner<S> {
/// Return estimate of the memory used, in bytes
#[allow(dead_code)] // not used in parquet_derive, so is dead there
pub fn estimated_memory_size(&self) -> usize {
self.storage.estimated_memory_size() +
// estimate size of dedup hashmap as just the size of the keys
self.dedup.capacity() + std::mem::size_of::<S::Key>()
self.storage.estimated_memory_size() + self.dedup.allocation_size()
}

/// Returns the storage for this interner
Expand Down
Loading