diff --git a/be/benchmark/benchmark_main.cpp b/be/benchmark/benchmark_main.cpp index 32add5a16621b9..e36f61febbce93 100644 --- a/be/benchmark/benchmark_main.cpp +++ b/be/benchmark/benchmark_main.cpp @@ -20,6 +20,7 @@ #include "benchmark_bit_pack.hpp" #include "benchmark_fastunion.hpp" #include "benchmark_hll_merge.hpp" +#include "benchmark_zone_map_index.hpp" #include "binary_cast_benchmark.hpp" #include "core/block/block.h" #include "vec/columns/column_string.h" diff --git a/be/benchmark/benchmark_zone_map_index.hpp b/be/benchmark/benchmark_zone_map_index.hpp new file mode 100644 index 00000000000000..2fe0d41733b9a2 --- /dev/null +++ b/be/benchmark/benchmark_zone_map_index.hpp @@ -0,0 +1,257 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// ============================================================ +// Benchmark: TypedZoneMapIndexWriter::add_values +// +// Measures CPU cost of feeding values into the per-page zone-map +// builder for a few representative primitive types and call sizes. +// ============================================================ + +#pragma once + +#include + +#include +#include +#include +#include +#include + +#include "core/data_type/data_type_factory.hpp" +#include "core/string_ref.h" +#include "storage/field.h" +#include "storage/index/zone_map/zone_map_index.h" +#include "storage/tablet/tablet_schema.h" +#include "util/slice.h" + +namespace doris::segment_v2 { + +namespace bench_zone_map { + +constexpr size_t kTotalRows = 1 << 16; // 65536 rows fed per iteration +constexpr size_t kStoragePageSize = 65536; // STORAGE_PAGE_SIZE_DEFAULT_VALUE + +inline std::vector gen_int32(size_t n) { + std::mt19937 rng(0xC0FFEE); + std::uniform_int_distribution d(-1'000'000, 1'000'000); + std::vector v(n); + for (auto& x : v) x = d(rng); + return v; +} +inline std::vector gen_int64(size_t n) { + std::mt19937_64 rng(0xC0FFEE); + std::uniform_int_distribution d(-1'000'000'000LL, 1'000'000'000LL); + std::vector v(n); + for (auto& x : v) x = d(rng); + return v; +} +inline std::vector gen_double(size_t n) { + std::mt19937 rng(0xC0FFEE); + std::uniform_real_distribution d(-1e6, 1e6); + std::vector v(n); + for (auto& x : v) x = d(rng); + return v; +} +// Build a contiguous string buffer + Slice array (ValType for string is StringRef/Slice). +struct StringBatch { + std::vector data; + std::vector slices; +}; +inline StringBatch gen_strings(size_t n, size_t avg_len = 16) { + StringBatch b; + b.data.reserve(n); + b.slices.reserve(n); + std::mt19937 rng(0xC0FFEE); + std::uniform_int_distribution ch('a', 'z'); + for (size_t i = 0; i < n; ++i) { + std::string s(avg_len, 'a'); + for (auto& c : s) c = static_cast(ch(rng)); + b.data.emplace_back(std::move(s)); + } + for (auto& s : b.data) b.slices.emplace_back(s.data(), s.size()); + return b; +} + +inline TabletColumnPtr make_column(FieldType ft, int32_t length, int32_t index_length) { + auto c = std::make_shared(); + c->_unique_id = 0; + c->_col_name = "c"; + c->_type = ft; + c->_is_key = true; + c->_is_nullable = false; + c->_length = length; + c->_index_length = index_length; + return c; +} + +template +std::unique_ptr make_writer() { + TabletColumnPtr col; + DataTypePtr dtype; + if constexpr (PType == TYPE_INT) { + col = make_column(FieldType::OLAP_FIELD_TYPE_INT, 4, 4); + dtype = DataTypeFactory::instance().create_data_type(TYPE_INT, false); + } else if constexpr (PType == TYPE_BIGINT) { + col = make_column(FieldType::OLAP_FIELD_TYPE_BIGINT, 8, 8); + dtype = DataTypeFactory::instance().create_data_type(TYPE_BIGINT, false); + } else if constexpr (PType == TYPE_DOUBLE) { + col = make_column(FieldType::OLAP_FIELD_TYPE_DOUBLE, 8, 8); + dtype = DataTypeFactory::instance().create_data_type(TYPE_DOUBLE, false); + } else if constexpr (PType == TYPE_VARCHAR) { + col = make_column(FieldType::OLAP_FIELD_TYPE_VARCHAR, 64, 1); + dtype = DataTypeFactory::instance().create_data_type(TYPE_VARCHAR, false, 0, 0, 64); + } + std::unique_ptr field(StorageFieldFactory::create(*col)); + std::unique_ptr w; + (void)ZoneMapIndexWriter::create(dtype, field.get(), w); + return w; +} + +template +void run(benchmark::State& state, const Vec& values) { + const size_t batch = static_cast(state.range(0)); + const size_t total = values.size(); + for (auto _ : state) { + auto w = make_writer(); + size_t off = 0; + while (off < total) { + size_t n = std::min(batch, total - off); + w->add_values(reinterpret_cast(&values[off]), n); + off += n; + } + (void)w->flush(); + benchmark::DoNotOptimize(w); + } + state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(total)); +} + +// Simulates the ScalarColumnWriter call pattern in compaction: +// - merge iterator hands `block_rows`-row blocks to ColumnWriter::append +// - column_writer chunks each block by page remaining capacity and calls +// add_values() per chunk +// - when a page is full, finish_current_page() calls flush() on the zone +// map builder, then a new page begins +// +// Compaction batch_size is computed dynamically as +// `block_mem_limit / group_data_size` clamped to [32, 4064] +// (be/src/storage/merger.cpp:458). For wide rows / variant data it routinely +// drops to the low end (32 - 256), which is the case the flame graph exposes. +template +void run_column_writer_like(benchmark::State& state, const Vec& values, size_t elem_size) { + const size_t block_rows = static_cast(state.range(0)); + const size_t page_capacity = kStoragePageSize / elem_size; // e.g. 16384 for int32 + const size_t total = values.size(); + for (auto _ : state) { + auto w = make_writer(); + size_t off = 0; + size_t page_used = 0; + while (off < total) { + size_t block_left = std::min(block_rows, total - off); + while (block_left > 0) { + size_t n = std::min(block_left, page_capacity - page_used); + w->add_values(reinterpret_cast(&values[off]), n); + off += n; + block_left -= n; + page_used += n; + if (page_used == page_capacity) { + (void)w->flush(); + page_used = 0; + } + } + } + if (page_used) (void)w->flush(); + benchmark::DoNotOptimize(w); + } + state.SetItemsProcessed(int64_t(state.iterations()) * int64_t(total)); +} + +static void BM_ZoneMap_Int32(benchmark::State& state) { + static auto vals = gen_int32(kTotalRows); + run(state, vals); +} +static void BM_ZoneMap_Int64(benchmark::State& state) { + static auto vals = gen_int64(kTotalRows); + run(state, vals); +} +static void BM_ZoneMap_Double(benchmark::State& state) { + static auto vals = gen_double(kTotalRows); + run(state, vals); +} +static void BM_ZoneMap_String(benchmark::State& state) { + static auto batch = gen_strings(kTotalRows, 16); + run(state, batch.slices); +} + +BENCHMARK(BM_ZoneMap_Int32)->Arg(1)->Arg(64)->Arg(1024); +BENCHMARK(BM_ZoneMap_Int64)->Arg(1)->Arg(64)->Arg(1024); +BENCHMARK(BM_ZoneMap_Double)->Arg(1)->Arg(64)->Arg(1024); +BENCHMARK(BM_ZoneMap_String)->Arg(1)->Arg(64)->Arg(1024); + +// Realistic compaction-shaped: 1024-row blocks + page-driven flush(). +static void BM_ZoneMap_ColWriter_Int32(benchmark::State& state) { + static auto vals = gen_int32(kTotalRows); + run_column_writer_like(state, vals, sizeof(int32_t)); +} +static void BM_ZoneMap_ColWriter_Int64(benchmark::State& state) { + static auto vals = gen_int64(kTotalRows); + run_column_writer_like(state, vals, sizeof(int64_t)); +} +static void BM_ZoneMap_ColWriter_Double(benchmark::State& state) { + static auto vals = gen_double(kTotalRows); + run_column_writer_like(state, vals, sizeof(double)); +} +static void BM_ZoneMap_ColWriter_String(benchmark::State& state) { + static auto batch = gen_strings(kTotalRows, 16); + // For strings the page packs (size+payload); use ~32B avg per element. + run_column_writer_like(state, batch.slices, 32); +} +BENCHMARK(BM_ZoneMap_ColWriter_Int32) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); +BENCHMARK(BM_ZoneMap_ColWriter_Int64) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); +BENCHMARK(BM_ZoneMap_ColWriter_Double) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); +BENCHMARK(BM_ZoneMap_ColWriter_String) + ->Arg(1) + ->Arg(4) + ->Arg(16) + ->Arg(64) + ->Arg(256) + ->Arg(1024) + ->Arg(4096); + +} // namespace bench_zone_map +} // namespace doris::segment_v2 diff --git a/be/src/core/field.h b/be/src/core/field.h index 75ce72f7553d44..aff03a7f4fb797 100644 --- a/be/src/core/field.h +++ b/be/src/core/field.h @@ -213,9 +213,7 @@ class Field { auto f = Field(PType); typename PrimitiveTypeTraits::CppType cpp_value; if constexpr (is_string_type(PType)) { - auto min_size = - MAX_ZONE_MAP_INDEX_SIZE >= data.size ? data.size : MAX_ZONE_MAP_INDEX_SIZE; - cpp_value = String(data.data, min_size); + cpp_value = String(data.data, data.size); } else if constexpr (is_date_or_datetime(PType)) { if constexpr (PType == TYPE_DATE) { cpp_value.from_olap_date(data); diff --git a/be/src/storage/index/zone_map/zone_map_index.cpp b/be/src/storage/index/zone_map/zone_map_index.cpp index 47414375db21ac..3c3a7ba6ed3e5a 100644 --- a/be/src/storage/index/zone_map/zone_map_index.cpp +++ b/be/src/storage/index/zone_map/zone_map_index.cpp @@ -123,17 +123,44 @@ TypedZoneMapIndexWriter::TypedZoneMapIndexWriter(DataTypePtr&& data_type) template void TypedZoneMapIndexWriter::_update_page_zonemap(const ValType& min_value, const ValType& max_value) { - auto min_field = doris::Field::create_field_from_olap_value(min_value); - auto max_field = doris::Field::create_field_from_olap_value(max_value); - if (!_page_zone_map.has_not_null || min_field < _page_zone_map.min_value) { - _page_zone_map.min_value = std::move(min_field); - } - if (!_page_zone_map.has_not_null || max_field > _page_zone_map.max_value) { - _page_zone_map.max_value = std::move(max_field); + // Hot path: compare/store using raw CppType to avoid Field temporaries. + // For string types, truncate to MAX_ZONE_MAP_INDEX_SIZE (matching the old + // Field-based path) and copy bytes into _page_{min,max}_storage so the + // StringRef stays valid across add_values() calls. + if constexpr (is_string_type(Type)) { + auto truncate_into = [](const StringRef& src, std::string& dst) { + auto sz = std::min(src.size, MAX_ZONE_MAP_INDEX_SIZE); + dst.assign(src.data, sz); + return StringRef(dst.data(), dst.size()); + }; + StringRef min_t(min_value.data, std::min(min_value.size, MAX_ZONE_MAP_INDEX_SIZE)); + StringRef max_t(max_value.data, std::min(max_value.size, MAX_ZONE_MAP_INDEX_SIZE)); + if (!_page_zone_map.has_not_null || min_t < _page_min) { + _page_min = truncate_into(min_value, _page_min_storage); + } + if (!_page_zone_map.has_not_null || _page_max < max_t) { + _page_max = truncate_into(max_value, _page_max_storage); + } + } else { + if (!_page_zone_map.has_not_null || min_value < _page_min) { + _page_min = min_value; + } + if (!_page_zone_map.has_not_null || max_value > _page_max) { + _page_max = max_value; + } } _page_zone_map.has_not_null = true; } +template +void TypedZoneMapIndexWriter::_materialize_page_minmax() { + if (!_page_zone_map.has_not_null) { + return; + } + _page_zone_map.min_value = doris::Field::create_field_from_olap_value(_page_min); + _page_zone_map.max_value = doris::Field::create_field_from_olap_value(_page_max); +} + template void TypedZoneMapIndexWriter::add_values(const void* values, size_t count) { if (count == 0) { @@ -196,14 +223,20 @@ void TypedZoneMapIndexWriter::reset_page_zone_map() { template Status TypedZoneMapIndexWriter::flush() { + // Materialize the running CppType min/max into the Field-typed page zone map + // before merging into the segment zone map / serializing to proto. + _materialize_page_minmax(); + // Update segment zone map. - if (!_segment_zone_map.has_not_null || - _segment_zone_map.min_value.get() > _page_zone_map.min_value.get()) { - _segment_zone_map.min_value = _page_zone_map.min_value; - } - if (!_segment_zone_map.has_not_null || - _segment_zone_map.max_value.get() < _page_zone_map.max_value.get()) { - _segment_zone_map.max_value = _page_zone_map.max_value; + if (_page_zone_map.has_not_null) { + if (!_segment_zone_map.has_not_null || + _segment_zone_map.min_value.get() > _page_zone_map.min_value.get()) { + _segment_zone_map.min_value = _page_zone_map.min_value; + } + if (!_segment_zone_map.has_not_null || + _segment_zone_map.max_value.get() < _page_zone_map.max_value.get()) { + _segment_zone_map.max_value = _page_zone_map.max_value; + } } if (_page_zone_map.has_null) { _segment_zone_map.has_null = true; diff --git a/be/src/storage/index/zone_map/zone_map_index.h b/be/src/storage/index/zone_map/zone_map_index.h index 9de968b5bdcac5..fe671f07fb3341 100644 --- a/be/src/storage/index/zone_map/zone_map_index.h +++ b/be/src/storage/index/zone_map/zone_map_index.h @@ -27,9 +27,9 @@ #include #include "common/status.h" -#include "core/arena.h" #include "core/data_type/data_type.h" #include "core/data_type/define_primitive_type.h" +#include "core/string_ref.h" #include "io/fs/file_reader_writer_fwd.h" #include "storage/field.h" #include "storage/metadata_adder.h" @@ -139,6 +139,9 @@ class TypedZoneMapIndexWriter final : public ZoneMapIndexWriter { private: void _reset_zone_map(ZoneMap* zone_map) { + // Do not reset min_value/max_value here: on the next page's first + // value write, min/max get updated and has_not_null is then set to + // true. zone_map->has_null = false; zone_map->has_not_null = false; zone_map->pass_all = false; @@ -149,13 +152,19 @@ class TypedZoneMapIndexWriter final : public ZoneMapIndexWriter { void _update_page_zonemap(const ValType& min_value, const ValType& max_value); + // Materialize the running CppType min/max into _page_zone_map.{min,max}_value. + // Called at flush() time, so the per-row hot path never constructs a Field. + void _materialize_page_minmax(); + DataTypePtr _data_type; - // memory will be managed by Arena ZoneMap _page_zone_map; ZoneMap _segment_zone_map; - // TODO(zc): we should replace this arena later, we only allocate min/max - // for field. But Arena allocate 4KB least, it will a waste for most cases. - Arena _arena; + // Running min/max for the current page kept as raw ValType. + // For string types, _page_min/_page_max are StringRefs that borrow into _page_min_storage/_page_max_storage. + ValType _page_min {}; + ValType _page_max {}; + std::string _page_min_storage; + std::string _page_max_storage; // serialized ZoneMapPB for each data page std::vector _values; diff --git a/be/test/storage/segment/zone_map_index_test.cpp b/be/test/storage/segment/zone_map_index_test.cpp index 7b3eecf1fe1e13..bc9d5f352a7d8f 100644 --- a/be/test/storage/segment/zone_map_index_test.cpp +++ b/be/test/storage/segment/zone_map_index_test.cpp @@ -1069,5 +1069,110 @@ TEST_F(ColumnZoneMapTest, TimestamptzPage) { delete field; } +// Regression test for "all-null page after a value page" — int variant. +// +// Page 1 has integers, page 2 is all nulls. The fix in flush() guards the +// segment merge with `if (_page_has_minmax)`. For ints the OLD code was +// incidentally correct because _reset_zone_map() doesn't touch min/max, +// so the stale page values still equal segment min/max and the merge is +// a no-op. This test pins that behavior. +TEST_F(ColumnZoneMapTest, AllNullPageAfterIntValues_SegmentMinMaxPreserved) { + TabletColumnPtr int_column = create_int_key(0); + std::unique_ptr field(StorageFieldFactory::create(*int_column)); + auto data_type_ptr = DataTypeFactory::instance().create_data_type(TYPE_INT, false); + + std::unique_ptr writer; + ASSERT_TRUE(ZoneMapIndexWriter::create(data_type_ptr, field.get(), writer).ok()); + + // Page 1: integers spanning [100, 200]. + std::vector values = {100, 150, 200}; + for (int32_t v : values) { + writer->add_values(&v, 1); + } + ASSERT_TRUE(writer->flush().ok()); + + // Page 2: all nulls. Without the _page_has_minmax guard the segment + // merge still runs with page 1's stale min/max; for ints this happens + // to be benign (identity merge), but the assertion keeps it that way. + writer->add_nulls(5); + ASSERT_TRUE(writer->flush().ok()); + + std::string file_path = kTestDir + "/all_null_after_int"; + io::FileWriterPtr file_writer; + ASSERT_TRUE(_fs->create_file(file_path, &file_writer).ok()); + ColumnIndexMetaPB index_meta; + ASSERT_TRUE(writer->finish(file_writer.get(), &index_meta).ok()); + ASSERT_TRUE(file_writer->close().ok()); + + const auto& seg_zm = index_meta.zone_map_index().segment_zone_map(); + EXPECT_TRUE(seg_zm.has_null()); + EXPECT_TRUE(seg_zm.has_not_null()); + EXPECT_EQ(std::to_string(100), seg_zm.min()); + EXPECT_EQ(std::to_string(200), seg_zm.max()); +} + +// Demonstrates the actual bug fixed by the `if (_page_has_minmax)` guard. +// +// Page 1 stores a single string of exactly MAX_ZONE_MAP_INDEX_SIZE bytes +// ending in 'x'; page 2 is all nulls. Trace on OLD code: +// * page 1 flush: +// - segment.max := page.max == "xxx...x" (un-modified copy) +// - modify_index_before_flush(_page_zone_map) increments the last +// byte of _page_zone_map.max_value in place: "xxx...x" -> "xxx...y" +// - _reset_zone_map() only clears flags; the modified "xxx...y" +// persists in _page_zone_map.max_value +// * page 2 flush (OLD, no _page_has_minmax guard): +// - segment.max("xxx...x") < _page_zone_map.max("xxx...y") -> TRUE +// - segment.max gets overwritten to "xxx...y" +// * finish(): +// - modify_index_before_flush(_segment_zone_map) increments again: +// "xxx...y" -> "xxx...z" // double-increment +// With the guard the page-2 segment merge is skipped, so segment.max +// stays "xxx...x" and finish() produces the correct single-incremented +// "xxx...y". +TEST_F(ColumnZoneMapTest, AllNullPageAfterMaxLenStringPage_NoSegmentMaxDoubleIncrement) { + auto data_type = DataTypeFactory::instance().create_data_type(TYPE_STRING, true); + auto tab_col = create_string_key(0); + std::unique_ptr field(StorageFieldFactory::create(*tab_col)); + + std::unique_ptr writer; + ASSERT_TRUE(ZoneMapIndexWriter::create(data_type, field.get(), writer).ok()); + + // Page 1: one string of exactly MAX_ZONE_MAP_INDEX_SIZE bytes, all 'x'. + std::string long_x(MAX_ZONE_MAP_INDEX_SIZE, 'x'); + Slice s(long_x); + writer->add_values(&s, 1); + ASSERT_TRUE(writer->flush().ok()); + + // Page 2: only nulls — triggers the all-null-page flush path. + writer->add_nulls(3); + ASSERT_TRUE(writer->flush().ok()); + + std::string file_path = kTestDir + "/all_null_after_long_string"; + io::FileWriterPtr file_writer; + ASSERT_TRUE(_fs->create_file(file_path, &file_writer).ok()); + ColumnIndexMetaPB index_meta; + ASSERT_TRUE(writer->finish(file_writer.get(), &index_meta).ok()); + ASSERT_TRUE(file_writer->close().ok()); + + const auto& seg_zm = index_meta.zone_map_index().segment_zone_map(); + EXPECT_TRUE(seg_zm.has_null()); + EXPECT_TRUE(seg_zm.has_not_null()); + + ASSERT_EQ(seg_zm.min().size(), MAX_ZONE_MAP_INDEX_SIZE); + EXPECT_EQ(seg_zm.min(), long_x); + + // Expected segment.max: long_x with last byte single-incremented to 'y'. + // Without the fix the last byte would be 'z' (double-incremented). + std::string expected_max = long_x; + expected_max.back() = 'y'; + ASSERT_EQ(seg_zm.max().size(), MAX_ZONE_MAP_INDEX_SIZE); + EXPECT_EQ(seg_zm.max(), expected_max) + << "BUG: all-null page 2 merged the already-modified " + "_page_zone_map.max_value into _segment_zone_map.max_value, " + "causing finish() to increment the last byte a second time."; + EXPECT_EQ(static_cast(seg_zm.max().back()), static_cast('y')); +} + } // namespace segment_v2 } // namespace doris