diff --git a/test/src/unit-capi-config.cc b/test/src/unit-capi-config.cc index f286faa91f5..cd6ba080a44 100644 --- a/test/src/unit-capi-config.cc +++ b/test/src/unit-capi-config.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2025 TileDB Inc. + * @copyright Copyright (c) 2017-2026 TileDB Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -277,6 +277,7 @@ void check_save_to_file() { << "\n"; ss << "sm.max_tile_overlap_size 314572800\n"; ss << "sm.mem.consolidation.buffers_weight 1\n"; + ss << "sm.mem.consolidation.initial_buffer_size 10485760\n"; ss << "sm.mem.consolidation.reader_weight 3\n"; ss << "sm.mem.consolidation.writer_weight 2\n"; ss << "sm.mem.malloc_trim true\n"; @@ -653,6 +654,7 @@ TEST_CASE("C API: Test config iter", "[capi][config]") { all_param_values["sm.query.condition_evaluator"] = Config::SM_QUERY_CONDITION_EVALUATOR; all_param_values["sm.query.sparse_unordered_with_dups.reader"] = "refactored"; + all_param_values["sm.mem.consolidation.initial_buffer_size"] = "10485760"; all_param_values["sm.mem.consolidation.buffers_weight"] = "1"; all_param_values["sm.mem.consolidation.reader_weight"] = "3"; all_param_values["sm.mem.consolidation.writer_weight"] = "2"; diff --git a/test/src/unit-capi-consolidation.cc b/test/src/unit-capi-consolidation.cc index e1907946285..69efbdb7d6c 100644 --- a/test/src/unit-capi-consolidation.cc +++ b/test/src/unit-capi-consolidation.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2021 TileDB Inc. + * @copyright Copyright (c) 2017-2026 TileDB Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -30,12 +30,15 @@ * Tests the C API consolidation. */ +#include #include +#include "test/support/src/error_helpers.h" #include "test/support/src/helpers.h" #include "test/support/src/vfs_helpers.h" #include "tiledb/common/stdx_string.h" #include "tiledb/platform/platform.h" #include "tiledb/sm/c_api/tiledb.h" +#include "tiledb/sm/consolidator/fragment_consolidator.h" #include "tiledb/sm/enums/encryption_type.h" #include "tiledb/sm/misc/tdb_time.h" #include "tiledb/storage_format/uri/parse_uri.h" @@ -46,6 +49,8 @@ using namespace tiledb::test; +using Asserter = AsserterCatch; + /** Tests for C API consolidation. */ struct ConsolidationFx { VFSTestSetup vfs_test_setup_; @@ -145,6 +150,12 @@ struct ConsolidationFx { void get_array_meta_files_dense(std::vector& files); void get_array_meta_vac_files_dense(std::vector& files); void get_vac_files(std::vector& files, bool dense = true); + void write_and_consolidate_fragments( + const char* array_name, + uint64_t num_small_cells, + uint64_t long_string_length, + uint64_t consolidation_budget, + tiledb_config_t* consolidate_cfg = nullptr); // Used to get the number of directories or files of another directory struct get_num_struct { @@ -7587,29 +7598,569 @@ TEST_CASE_METHOD( } } +/** + * Helper method which attempts to validate fragment consolidation by writing + * `num_small_cells` small cells before writing one large cell of length + * `long_string_length`. Consolidation will succeed up to some value of + * `long_string_length`, and fail after by disrespecting the memory budget. + * + * @param array_name The name of the array. + * @param num_small_cells The number of small cells to consolidate. + * @param long_string_length The length of the long string to write. + * @param consolidation_budget The total budget to set for consolidation. + * @param consolidate_cfg Optional cfg with additional set-consolidation params. + */ +void ConsolidationFx::write_and_consolidate_fragments( + const char* array_name, + uint64_t num_small_cells, + uint64_t long_string_length, + uint64_t consolidation_budget, + tiledb_config_t* consolidate_cfg) { + std::string words[8] = { + "foo", "bar", "apple", "orange", "banana", "red", "yellow", "blue"}; + + tiledb_config_t* cfg; + tiledb_error_t* err = nullptr; + TRY(ctx_, tiledb_config_alloc(&cfg, &err)); + REQUIRE(err == nullptr); + + // Create array + tiledb_dimension_t* dim; + uint64_t tile_extent = std::max(num_small_cells, long_string_length); + uint64_t dim_domain[] = {0, tile_extent + 1}; + TRY(ctx_, + tiledb_dimension_alloc( + ctx_, "dim", TILEDB_UINT64, &dim_domain, &tile_extent, &dim)); + tiledb_domain_t* domain; + TRY(ctx_, tiledb_domain_alloc(ctx_, &domain)); + TRY(ctx_, tiledb_domain_add_dimension(ctx_, domain, dim)); + tiledb_attribute_t* attr; + TRY(ctx_, tiledb_attribute_alloc(ctx_, "attr", TILEDB_CHAR, &attr)); + TRY(ctx_, + set_attribute_compression_filter(ctx_, attr, TILEDB_FILTER_GZIP, -1)); + TRY(ctx_, tiledb_attribute_set_cell_val_num(ctx_, attr, TILEDB_VAR_NUM)); + tiledb_array_schema_t* array_schema; + TRY(ctx_, tiledb_array_schema_alloc(ctx_, TILEDB_SPARSE, &array_schema)); + TRY(ctx_, + tiledb_array_schema_set_cell_order(ctx_, array_schema, TILEDB_ROW_MAJOR)); + TRY(ctx_, + tiledb_array_schema_set_tile_order(ctx_, array_schema, TILEDB_ROW_MAJOR)); + TRY(ctx_, tiledb_array_schema_set_capacity(ctx_, array_schema, 2)); + TRY(ctx_, tiledb_array_schema_set_domain(ctx_, array_schema, domain)); + TRY(ctx_, tiledb_array_schema_add_attribute(ctx_, array_schema, attr)); + TRY(ctx_, tiledb_array_schema_check(ctx_, array_schema)); + + if (encryption_type_ != TILEDB_NO_ENCRYPTION) { + std::string encryption_type_string = + encryption_type_str((tiledb::sm::EncryptionType)encryption_type_); + TRY(ctx_, + tiledb_config_set( + cfg, "sm.encryption_type", encryption_type_string.c_str(), &err)); + REQUIRE(err == nullptr); + TRY(ctx_, + tiledb_config_set(cfg, "sm.encryption_key", encryption_key_, &err)); + REQUIRE(err == nullptr); + // Do not remove the array when recreating context to set the new config + vfs_test_setup_.update_config(cfg); + ctx_ = vfs_test_setup_.ctx_c; + vfs_ = vfs_test_setup_.vfs_c; + } + TRY(ctx_, tiledb_array_create(ctx_, array_name, array_schema)); + tiledb_attribute_free(&attr); + tiledb_dimension_free(&dim); + tiledb_domain_free(&domain); + tiledb_array_schema_free(&array_schema); + + // Prepare to write many small cells to the array + std::vector small_cells_coords; + std::vector small_cells_offsets; + small_cells_coords.reserve(num_small_cells); + small_cells_offsets.reserve(num_small_cells); + small_cells_offsets.push_back(0); + std::string small_cells_string = ""; + for (uint64_t i = 0; i < num_small_cells; i++) { + std::string word = words[i % 8]; + small_cells_string += word; + small_cells_coords.push_back(i + 1); + if (i != num_small_cells - 1) { + small_cells_offsets.push_back(small_cells_offsets[i] + word.length()); + } + } + std::vector small_cells_vec( + small_cells_string.begin(), small_cells_string.end()); + uint64_t small_cells_vec_size = small_cells_vec.size(); + uint64_t small_cells_coords_size = + sizeof(uint64_t) * small_cells_coords.size(); + uint64_t small_cells_offsets_size = + sizeof(uint64_t) * small_cells_offsets.size(); + + // Write small cells to the array + tiledb_array_t* array; + TRY(ctx_, tiledb_array_alloc(ctx_, array_name, &array)); + TRY(ctx_, tiledb_array_open(ctx_, array, TILEDB_WRITE)); + tiledb_query_t* query; + TRY(ctx_, tiledb_query_alloc(ctx_, array, TILEDB_WRITE, &query)); + TRY(ctx_, tiledb_query_set_layout(ctx_, query, TILEDB_GLOBAL_ORDER)); + TRY(ctx_, + tiledb_query_set_data_buffer( + ctx_, + query, + "attr", + small_cells_string.data(), + &small_cells_vec_size)); + TRY(ctx_, + tiledb_query_set_offsets_buffer( + ctx_, + query, + "attr", + small_cells_offsets.data(), + &small_cells_offsets_size)); + TRY(ctx_, + tiledb_query_set_data_buffer( + ctx_, + query, + "dim", + small_cells_coords.data(), + &small_cells_coords_size)); + TRY(ctx_, tiledb_query_submit_and_finalize(ctx_, query)); + TRY(ctx_, tiledb_array_close(ctx_, array)); + tiledb_array_free(&array); + tiledb_query_free(&query); + + // Prepare to write long string to the array + const std::string test_chars = "abcdefghijklmnopqrstuvwxyz"; + std::vector long_string; + for (uint64_t i = 0; i < long_string_length; i++) { + long_string.emplace_back(test_chars[i % 26]); + } + uint64_t long_string_size = long_string.size(); + uint64_t long_string_offset = 0; + uint64_t long_string_offset_size = sizeof(uint64_t); + uint64_t long_string_coord = small_cells_coords.back(); + uint64_t long_string_coord_size = sizeof(uint64_t); + + // Write long string to the array + TRY(ctx_, tiledb_array_alloc(ctx_, array_name, &array)); + TRY(ctx_, tiledb_array_open(ctx_, array, TILEDB_WRITE)); + TRY(ctx_, tiledb_query_alloc(ctx_, array, TILEDB_WRITE, &query)); + TRY(ctx_, tiledb_query_set_layout(ctx_, query, TILEDB_GLOBAL_ORDER)); + TRY(ctx_, + tiledb_query_set_data_buffer( + ctx_, query, "attr", long_string.data(), &long_string_size)); + TRY(ctx_, + tiledb_query_set_offsets_buffer( + ctx_, query, "attr", &long_string_offset, &long_string_offset_size)); + TRY(ctx_, + tiledb_query_set_data_buffer( + ctx_, query, "dim", &long_string_coord, &long_string_coord_size)); + TRY(ctx_, tiledb_query_submit_and_finalize(ctx_, query)); + TRY(ctx_, tiledb_array_close(ctx_, array)); + tiledb_array_free(&array); + tiledb_query_free(&query); + + // Consolidate, using caller's config if provided. + tiledb_config_t* consolidation_cfg = + (consolidate_cfg != nullptr) ? consolidate_cfg : cfg; + TRY(ctx_, + tiledb_config_set( + consolidation_cfg, + "sm.mem.total_budget", + std::to_string(consolidation_budget).c_str(), + &err)); + REQUIRE(err == nullptr); + TRY(ctx_, + tiledb_config_set( + consolidation_cfg, "sm.consolidation.step_min_frags", "2", &err)); + REQUIRE(err == nullptr); + + throw_if_error( + ctx_, tiledb_array_consolidate(ctx_, array_name, consolidation_cfg)); + tiledb_config_free(&cfg); + + // Ensure there is only 1 fragment after consolidation. + tiledb_fragment_info_t* fragment_info = nullptr; + TRY(ctx_, tiledb_fragment_info_alloc(ctx_, array_name, &fragment_info)); + TRY(ctx_, tiledb_fragment_info_load(ctx_, fragment_info)); + uint32_t fragment_num; + TRY(ctx_, + tiledb_fragment_info_get_fragment_num( + ctx_, fragment_info, &fragment_num)); + CHECK(fragment_num == 1); + tiledb_fragment_info_free(&fragment_info); + + // Validate data after consolidation. + uint64_t read_data_size = long_string_size + small_cells_vec_size; + uint64_t read_offset_size = + small_cells_offsets_size + long_string_offset_size; + uint64_t read_coords_size = small_cells_coords_size + long_string_coord_size; + auto read_data_buffer = (char*)malloc(read_data_size); + auto read_offset_buffer = (uint64_t*)malloc(read_offset_size); + auto read_coords_buffer = (uint64_t*)malloc(read_coords_size); + TRY(ctx_, tiledb_array_alloc(ctx_, array_name, &array)); + TRY(ctx_, tiledb_array_open(ctx_, array, TILEDB_READ)); + TRY(ctx_, tiledb_query_alloc(ctx_, array, TILEDB_READ, &query)); + TRY(ctx_, tiledb_query_set_layout(ctx_, query, TILEDB_GLOBAL_ORDER)); + TRY(ctx_, + tiledb_query_set_data_buffer( + ctx_, query, "attr", read_data_buffer, &read_data_size)); + TRY(ctx_, + tiledb_query_set_offsets_buffer( + ctx_, query, "attr", read_offset_buffer, &read_offset_size)); + TRY(ctx_, + tiledb_query_set_data_buffer( + ctx_, query, "dim", read_coords_buffer, &read_coords_size)); + TRY(ctx_, tiledb_query_submit(ctx_, query)); + TRY(ctx_, tiledb_array_close(ctx_, array)); + tiledb_array_free(&array); + tiledb_query_free(&query); + // Note: There's a bug which drops the last 4 bits of the first fragment. + // #TODO Update this test once the bug is fixed. + std::vector expected_values( + small_cells_string.begin(), small_cells_string.end() - 4); + expected_values.insert( + expected_values.end(), long_string.begin(), long_string.end()); + for (uint64_t i = 0; i < read_data_size; i++) { + CHECK(read_data_buffer[i] == expected_values[i]); + } +} + TEST_CASE_METHOD( ConsolidationFx, - "C API: Test consolidation, sparse string, no progress", - "[capi][consolidation][sparse][string][no-progress][non-rest]") { - remove_sparse_string_array(); - create_sparse_string_array(); + "C API: Test sparse fragment consolidation", + "[capi][consolidation][fragment][sparse][non-rest]") { + const char* array_name = "fragment_consolidation_array"; + remove_array(array_name); + + uint64_t num_small_cells = 10000; + uint64_t long_string_length = 10000; + uint64_t consolidation_budget = 10000000; + std::string expected_error_msg = ""; + + SECTION( + "Success: " + "num small cells = 10000, " + "long string length = 10000, " + "consolidation_budget = 10000000") { + num_small_cells = 10000; + long_string_length = 10000; + consolidation_budget = 10000000; + CHECK_NOTHROW(write_and_consolidate_fragments( + array_name, num_small_cells, long_string_length, consolidation_budget)); + } - write_sparse_string_full(); - write_sparse_string_unordered(); - consolidate_sparse_string(1, true); + // #TODO update intercept tests. These are currently out-of-date. + // #TODO decide what meaningful information to gather & report... + /*SECTION("Success; validate reader thread's wait conditions: ") { + // This test explicitly validates the edge cases of the reader thread's wait + // conditions in `FragmentConsolidator::copy_array`. + // + // If the so-called `next_buffer_size` to enqueue will put the + // `enqueued_buffer_size` over the `max_buffer_size`, the reader must wait + // for the writer to dequeue buffers until there's room. + // + // This test validates that this condition occurs: + // 1. At the end of a read iteration + // 2. After buffer growth + num_small_cells = 10000; + consolidation_budget = 10000000; + + int end_of_reader_count = 0; + int after_buffer_growth_count = 0; + auto validate_wait = + tiledb::sm::intercept::fragment_consolidator_copy_array().and_also( + [&end_of_reader_count, &after_buffer_growth_count]( + const uint64_t& enqueued_buffer_size, + uint64_t& next_buffer_size, + uint64_t& max_buffer_size, + bool buffer_has_grown) { + if (buffer_has_grown) { + after_buffer_growth_count++; + CHECK(enqueued_buffer_size != 0); + CHECK(next_buffer_size <= max_buffer_size); + } else { + end_of_reader_count++; + // #TODO + } + //CHECK(enqueued_buffer_size + next_buffer_size > + max_buffer_size); + }); + + SECTION("no wait") { + // The entire fragment can fit in one read / write, so no reader waiting + long_string_length = 10000; + CHECK_NOTHROW(write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget)); + CHECK(end_of_reader_count == 0); + CHECK(after_buffer_growth_count == 0); + } - tiledb_error_t* err = NULL; - tiledb_ctx_get_last_error(ctx_, &err); + SECTION("at the end of read iteration") { + // The buffer can fit and will not grow here + long_string_length = 10000; + + tiledb_config_t* cfg; + tiledb_error_t* err = nullptr; + TRY(ctx_, tiledb_config_alloc(&cfg, &err)); + REQUIRE(err == nullptr); + TRY(ctx_, + tiledb_config_set( + cfg, + "sm.mem.consolidation.initial_buffer_size", + std::to_string(8 * (num_small_cells + long_string_length)) + .c_str(), + &err)); + REQUIRE(err == nullptr); + + CHECK_NOTHROW(write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget, + cfg)); + tiledb_config_free(&cfg); + //CHECK(end_of_reader_count == 2); + CHECK(after_buffer_growth_count == 0); + } + + SECTION("after buffer growth") { + // Write a fragment that is 4x `initial_buffer_size` to ensure 4x growth + uint64_t initial_buffer_size = 10000; + long_string_length = 4 * initial_buffer_size; + + tiledb_config_t* cfg; + tiledb_error_t* err = nullptr; + TRY(ctx_, tiledb_config_alloc(&cfg, &err)); + REQUIRE(err == nullptr); + TRY(ctx_, + tiledb_config_set( + cfg, + "sm.mem.consolidation.initial_buffer_size", + std::to_string(initial_buffer_size).c_str(), + &err)); + REQUIRE(err == nullptr); + // Do not remove the array when recreating context to set the new config + vfs_test_setup_.update_config(cfg); + ctx_ = vfs_test_setup_.ctx_c; + vfs_ = vfs_test_setup_.vfs_c; + + CHECK_NOTHROW(write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget, + cfg)); + tiledb_config_free(&cfg); + CHECK(end_of_reader_count > 8); + CHECK(after_buffer_growth_count == 4); + } + }*/ + + // #TODO This hangs unless `initial_buffer_budget` is floored by a factor of 8 + /*SECTION( + "Error after buffer growth: " + "num small cells = 10000, " + "long string length = 5000000, " + "consolidation_budget = 10000000") { + expected_error_msg = " Unable to copy one slab with current budget/buffers"; + num_small_cells = 10000; + long_string_length = 5000000; + consolidation_budget = 10000000; + CHECK_THROWS_WITH( + write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget), + Catch::Matchers::ContainsSubstring(expected_error_msg)); + }*/ + + SECTION( + "Error attempting to load R-tree: " + "num small cells = 100000, " + "long string length = 100000, " + "consolidation_budget = 10000000 ") { + expected_error_msg = "Cannot load R-tree; Insufficient memory budget"; + num_small_cells = 100000; + long_string_length = 100000; + consolidation_budget = 10000000; + CHECK_THROWS_WITH( + write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget), + Catch::Matchers::ContainsSubstring(expected_error_msg)); + } - const char* msg; - tiledb_error_message(err, &msg); - CHECK( - std::string("FragmentConsolidator: Consolidation read 0 cells, no " - "progress can be made") == msg); + SECTION( + "Error loading tile offsets: " + "num small cells = 1000, " + "long string length = 1000, " + "consolidation_budget = 500000 ") { + expected_error_msg = "Cannot load tile offsets"; + + num_small_cells = 1000; + long_string_length = 1000; + consolidation_budget = 500000; + CHECK_THROWS_WITH( + write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget), + Catch::Matchers::ContainsSubstring(expected_error_msg)); + } - remove_sparse_string_array(); + // #TODO This hangs unless `initial_buffer_budget` is floored by a factor of 8 + /*sSECTION( + "Error after buffer growth: " + "num small cells = 2, " + "long string length = 20000, " + "consolidation_budget = 50000 ") { + expected_error_msg = "Consolidation read 0 cells"; + + num_small_cells = 2; + long_string_length = 20000; + consolidation_budget = 50000; + CHECK_THROWS_WITH( + write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget), + Catch::Matchers::ContainsSubstring(expected_error_msg)); + }*/ + + SECTION( + "Success when setting a too-large initial_buffer_size: " + "num small cells = 10000, " + "long string length = 10000, " + "consolidation_budget = 10000000, " + "initial_buffer_size = consolidation_budget") { + // Note: rather than fail with "Consolidation read 0 cells", + // the consolidator will use `initial_buffer_size` == `buffer_budget` + num_small_cells = 10000; + long_string_length = 10000; + consolidation_budget = 10000000; + uint64_t initial_buffer_size = consolidation_budget; + + tiledb_config_t* cfg; + tiledb_error_t* err = nullptr; + TRY(ctx_, tiledb_config_alloc(&cfg, &err)); + REQUIRE(err == nullptr); + TRY(ctx_, + tiledb_config_set( + cfg, + "sm.mem.consolidation.initial_buffer_size", + std::to_string(initial_buffer_size).c_str(), + &err)); + REQUIRE(err == nullptr); + + CHECK_NOTHROW(write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget, + cfg)); + tiledb_config_free(&cfg); + } + + SECTION( + "Success when setting deprecated param buffer_size too large: " + "num small cells = 10000, " + "long string length = 10000, " + "consolidation_budget = 10000000, " + "buffer_size = consolidation_budget") { + // Note: rather than fail with "Consolidation read 0 cells", + // the consolidator will use `buffer_size` == `buffer_budget` + num_small_cells = 10000; + long_string_length = 10000; + consolidation_budget = 10000000; + uint64_t buffer_size = consolidation_budget; + + tiledb_config_t* cfg; + tiledb_error_t* err = nullptr; + TRY(ctx_, tiledb_config_alloc(&cfg, &err)); + REQUIRE(err == nullptr); + TRY(ctx_, + tiledb_config_set( + cfg, + "sm.consolidation.buffer_size", + std::to_string(buffer_size).c_str(), + &err)); + REQUIRE(err == nullptr); + + CHECK_NOTHROW(write_and_consolidate_fragments( + array_name, + num_small_cells, + long_string_length, + consolidation_budget, + cfg)); + tiledb_config_free(&cfg); + } + + remove_array(array_name); } +// Commenting out; will probably need to remove +/*TEST_CASE("C API: Fragment consolidation benchmark", + "[capi][consolidation][fragments][benchmark][non-rest]") { + // original array: + // +s3://tiledb-spencer/customers/cellarity/soma/index_siletti23_allenbrainatlas_cellarity_2/obs/ + // ~200MB array + // 1 dim, 53 nullable, var-sized attrs + // grew the array to ~7.5GB via Tables: + // CREATE EXTERNAL TABLE benchmark STORED AS tiledb LOCATION ; + // INSERT INTO benchmark SELECT * FROM benchmark; + + const char* array_name = "benchmark_array"; + tiledb_ctx_t* ctx; + tiledb_ctx_alloc(NULL, &ctx); + + tiledb_vfs_t* vfs; + tiledb_vfs_alloc(ctx, NULL, &vfs); + int is_dir = 0; + tiledb_vfs_is_dir(ctx, vfs, array_name, &is_dir); + tiledb_vfs_free(&vfs); + if (!is_dir) { + tiledb_ctx_free(&ctx); + SKIP("Benchmark array does not exist - create benchmark_array to run +this test"); + } + + uint64_t initial_buffer_size = 2.5 * 1073741824; // 2.5GB; + tiledb_config_t* cfg; + tiledb_error_t* err = nullptr; + TRY(ctx, tiledb_config_alloc(&cfg, &err)); + REQUIRE(err == nullptr); + TRY(ctx, + tiledb_config_set( + cfg, + "sm.mem.consolidation.initial_buffer_size", + std::to_string(initial_buffer_size).c_str(), + &err)); + REQUIRE(err == nullptr); + + auto begin = std::chrono::high_resolution_clock::now(); + TRY(ctx, tiledb_array_consolidate(ctx, array_name, cfg)); + auto end = std::chrono::high_resolution_clock::now(); + + auto elapsed_sec = + std::chrono::duration_cast(end - begin); + auto elapsed_msec = + std::chrono::duration_cast(end - begin); + + std::cerr<<"Time elapsed, seconds: "< error_if_any(tiledb_ctx_t* ctx, auto apirc) { } } +/** + * Asserts that a C API call does not return error. + */ +template +void capi_try(tiledb_ctx_t* ctx, int rc) { + // suppress the std::optional false positive from gcc 13 +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + const std::optional maybe_err = + tiledb::test::error_if_any(ctx, rc); + ASSERTER(maybe_err == std::optional{}); +#pragma GCC diagnostic pop +#else + const std::optional maybe_err = + tiledb::test::error_if_any(ctx, rc); + ASSERTER(maybe_err == std::optional{}); +#endif +} + /** * Throws a `std::runtime_error` if the operation returning `thing` * did not return `TILEDB_OK`. @@ -102,4 +112,9 @@ void throw_if_error(tiledb_ctx_t* ctx, capi_return_t thing); } // namespace tiledb::test +/** + * Asserts that a C API call does not return error. + */ +#define TRY(ctx, thing) tiledb::test::capi_try(ctx, thing) + #endif diff --git a/tiledb/api/c_api/config/config_api_external.h b/tiledb/api/c_api/config/config_api_external.h index 0492b28edfc..b6a729b34bc 100644 --- a/tiledb/api/c_api/config/config_api_external.h +++ b/tiledb/api/c_api/config/config_api_external.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2023-2025 TileDB, Inc. + * @copyright Copyright (c) 2023-2026 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -292,6 +292,10 @@ TILEDB_EXPORT void tiledb_config_free(tiledb_config_t** config) TILEDB_NOEXCEPT; * - `sm.mem.total_budget`
* Memory budget for readers and writers.
* **Default**: 10GB + * - `sm.mem.consolidation.initial_buffer_size`
+ * The initial size of the consolidation buffers before growth. The buffers + * will remain within the budgeted range. + * **Default**: 10MB * - `sm.mem.consolidation.buffers_weight`
* Weight used to split `sm.mem.total_budget` and assign to the * consolidation buffers. The budget is split across 3 values, diff --git a/tiledb/sm/config/config.cc b/tiledb/sm/config/config.cc index 3086b49aa1f..b5ca42ef385 100644 --- a/tiledb/sm/config/config.cc +++ b/tiledb/sm/config/config.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2025 TileDB, Inc. + * @copyright Copyright (c) 2017-2026 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -112,6 +112,8 @@ const std::string Config::SM_QUERY_CONDITION_EVALUATOR = "ast"; const std::string Config::SM_MEM_MALLOC_TRIM = "true"; const std::string Config::SM_UPPER_MEMORY_LIMIT = "1073741824"; // 1GB const std::string Config::SM_MEM_TOTAL_BUDGET = "10737418240"; // 10GB +const std::string Config::SM_MEM_CONSOLIDATION_INITIAL_BUFFER_SIZE = + "10485760"; // 10MB const std::string Config::SM_MEM_CONSOLIDATION_BUFFERS_WEIGHT = "1"; const std::string Config::SM_MEM_CONSOLIDATION_READER_WEIGHT = "3"; const std::string Config::SM_MEM_CONSOLIDATION_WRITER_WEIGHT = "2"; @@ -318,6 +320,9 @@ const std::map default_config_values = { std::make_pair( "sm.mem.tile_upper_memory_limit", Config::SM_UPPER_MEMORY_LIMIT), std::make_pair("sm.mem.total_budget", Config::SM_MEM_TOTAL_BUDGET), + std::make_pair( + "sm.mem.consolidation.initial_buffer_size", + Config::SM_MEM_CONSOLIDATION_INITIAL_BUFFER_SIZE), std::make_pair( "sm.mem.consolidation.buffers_weight", Config::SM_MEM_CONSOLIDATION_BUFFERS_WEIGHT), diff --git a/tiledb/sm/config/config.h b/tiledb/sm/config/config.h index 5a1599647a9..6793a51781d 100644 --- a/tiledb/sm/config/config.h +++ b/tiledb/sm/config/config.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2017-2025 TileDB, Inc. + * @copyright Copyright (c) 2017-2026 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -292,6 +292,9 @@ class Config { /** Maximum memory budget for readers and writers. */ static const std::string SM_MEM_TOTAL_BUDGET; + /** Initial size for consolidation buffers before growth. */ + static const std::string SM_MEM_CONSOLIDATION_INITIAL_BUFFER_SIZE; + /** Weight for consolidation buffers used to split total budget. */ static const std::string SM_MEM_CONSOLIDATION_BUFFERS_WEIGHT; diff --git a/tiledb/sm/consolidator/fragment_consolidator.cc b/tiledb/sm/consolidator/fragment_consolidator.cc index db6f9bcad4d..ad287924297 100644 --- a/tiledb/sm/consolidator/fragment_consolidator.cc +++ b/tiledb/sm/consolidator/fragment_consolidator.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022-2025 TileDB, Inc. + * @copyright Copyright (c) 2022-2026 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -47,11 +47,21 @@ #include #include #include +#include using namespace tiledb::common; namespace tiledb::sm { +namespace intercept { +DEFINE_INTERCEPT( + fragment_consolidator_copy_array, + const uint64_t&, + uint64_t&, + uint64_t&, + bool); +} + class FragmentConsolidatorException : public StatusException { public: explicit FragmentConsolidatorException(const std::string& message) @@ -60,21 +70,15 @@ class FragmentConsolidatorException : public StatusException { }; FragmentConsolidationWorkspace::FragmentConsolidationWorkspace( - shared_ptr memory_tracker) + shared_ptr memory_tracker, + const FragmentConsolidationConfig& config, + const ArraySchema& array_schema, + std::unordered_map& avg_cell_sizes, + uint64_t total_buffers_budget) : backing_buffer_( memory_tracker->get_resource(MemoryType::CONSOLIDATION_BUFFERS)) , buffers_(memory_tracker->get_resource(MemoryType::CONSOLIDATION_BUFFERS)) , sizes_(memory_tracker->get_resource(MemoryType::CONSOLIDATION_BUFFERS)) { -} - -void FragmentConsolidationWorkspace::resize_buffers( - stats::Stats* stats, - const FragmentConsolidationConfig& config, - const ArraySchema& array_schema, - std::unordered_map& avg_cell_sizes, - uint64_t total_buffers_budget) { - auto timer_se = stats->start_timer("resize_buffers"); - // For easy reference auto attribute_num = array_schema.attribute_num(); auto& domain{array_schema.domain()}; @@ -248,8 +252,6 @@ Status FragmentConsolidator::consolidate( return st; } - FragmentConsolidationWorkspace cw(consolidator_memory_tracker_); - uint32_t step = 0; std::vector to_consolidate; do { @@ -282,8 +284,7 @@ Status FragmentConsolidator::consolidate( array_for_writes, to_consolidate, union_non_empty_domains, - &new_fragment_uri, - cw); + &new_fragment_uri); if (!st.ok()) { throw_if_not_ok(array_for_reads->close()); throw_if_not_ok(array_for_writes->close()); @@ -463,8 +464,6 @@ Status FragmentConsolidator::consolidate_fragments( } } - FragmentConsolidationWorkspace cw(consolidator_memory_tracker_); - // Consolidate the selected fragments URI new_fragment_uri; st = consolidate_internal( @@ -472,8 +471,7 @@ Status FragmentConsolidator::consolidate_fragments( array_for_writes, to_consolidate, union_non_empty_domains, - &new_fragment_uri, - cw); + &new_fragment_uri); if (!st.ok()) { throw_if_not_ok(array_for_reads->close()); throw_if_not_ok(array_for_writes->close()); @@ -581,8 +579,7 @@ Status FragmentConsolidator::consolidate_internal( shared_ptr array_for_writes, const std::vector& to_consolidate, const NDRange& union_non_empty_domains, - URI* new_fragment_uri, - FragmentConsolidationWorkspace& cw) { + URI* new_fragment_uri) { auto timer_se = stats_->start_timer("consolidate_internal"); array_for_reads->load_fragments(to_consolidate); @@ -626,19 +623,14 @@ Status FragmentConsolidator::consolidate_internal( uint64_t total_weights = config_.buffers_weight_ + config_.reader_weight_ + config_.writer_weight_; uint64_t single_unit_budget = config_.total_budget_ / total_weights; - uint64_t buffers_budget = config_.buffers_weight_ * single_unit_budget; + uint64_t buffer_budget = config_.buffers_weight_ * single_unit_budget; uint64_t reader_budget = config_.reader_weight_ * single_unit_budget; uint64_t writer_budget = config_.writer_weight_ * single_unit_budget; - // Prepare buffers - auto average_var_cell_sizes = array_for_reads->get_average_var_cell_sizes(); - cw.resize_buffers( - stats_, config_, array_schema, average_var_cell_sizes, buffers_budget); - // Create queries tdb_unique_ptr query_r = nullptr; tdb_unique_ptr query_w = nullptr; - throw_if_not_ok(create_queries( + RETURN_NOT_OK(create_queries( array_for_reads, array_for_writes, union_non_empty_domains, @@ -654,66 +646,327 @@ Status FragmentConsolidator::consolidate_internal( vac_uri = array_for_reads->array_directory().get_vacuum_uri(*new_fragment_uri); } catch (std::exception& e) { - FragmentConsolidatorException( + throw FragmentConsolidatorException( "Internal consolidation failed with exception" + std::string(e.what())); } - // Read from one array and write to the other - copy_array(query_r.get(), query_w.get(), cw); + // Consolidate fragments + try { + // Graciously attempt to consolidate by default. + // `initial_buffer_size`: + // if set, use deprecated `config_.buffer_size_`(-> `buffer_budget`). + // else use `config_.initial_buffer_size_`. + // max_queue_size: + // if user has set `initial_buffer_size`, allow growth up to total_budget + // else, cap growth at buffer_budget/2 + // note: divisor ensures pipeline parallelism. + uint64_t initial_buffer_size = config_.buffer_size_ != 0 ? + buffer_budget : + config_.initial_buffer_size_; + uint64_t cap = + config_.initial_buffer_size_user_set_ ? + std::min(config_.total_budget_, config_.initial_buffer_size_) : + buffer_budget / 2; + initial_buffer_size = std::min(initial_buffer_size, cap); + uint64_t max_queue_size = + config_.initial_buffer_size_user_set_ ? cap : buffer_budget; + + // Safeguard: estimate total read size (bytes) so the read doesn't run + // forever if the query submission never reports COMPLETE. + // Use the sum of fragment storage sizes (actual bytes on disk); + // if fragment sizes are not available, fall back to cell-based estimate. + // Use 20% error margin so we don't stop early if actual exceeds estimate. + uint64_t total_fragment_size = 0; + for (const auto& frag_md : array_for_reads->fragment_metadata()) { + total_fragment_size += frag_md->fragment_size(); + } + auto avg_var_sizes = array_for_reads->get_average_var_cell_sizes(); + uint64_t expected_total_bytes = 0; + constexpr uint64_t error_margin = 20; + if (total_fragment_size > 0) { + expected_total_bytes = + total_fragment_size + (total_fragment_size * error_margin / 100); + } else { + uint64_t total_cell_num = 0; + for (const auto& frag_md : array_for_reads->fragment_metadata()) { + total_cell_num += frag_md->cell_num(); + } + uint64_t bytes_per_cell = + compute_bytes_per_cell(array_schema, avg_var_sizes); + if (total_cell_num > 0) { + uint64_t cell_based = total_cell_num * bytes_per_cell; + expected_total_bytes = cell_based + (cell_based * error_margin / 100); + } + } - // Finalize write query - auto st = query_w->finalize(); - if (!st.ok()) { - if (resources_.vfs().is_dir(*new_fragment_uri)) + // Read from one array and write to the other + copy_array( + query_r.get(), + query_w.get(), + array_schema, + avg_var_sizes, + initial_buffer_size, + max_queue_size, + expected_total_bytes); + + // Write vacuum file + throw_if_not_ok(write_vacuum_file( + array_for_reads->array_schema_latest().write_version(), + array_for_reads->array_uri(), + vac_uri, + to_consolidate)); + } catch (...) { + if (resources_.vfs().is_dir(*new_fragment_uri)) { resources_.vfs().remove_dir(*new_fragment_uri); - return st; + } + std::rethrow_exception(std::current_exception()); } - // Write vacuum file - st = write_vacuum_file( - array_for_reads->array_schema_latest().write_version(), - array_for_reads->array_uri(), - vac_uri, - to_consolidate); - if (!st.ok()) { - if (resources_.vfs().is_dir(*new_fragment_uri)) - resources_.vfs().remove_dir(*new_fragment_uri); - return st; + return Status::Ok(); +} + +uint64_t FragmentConsolidator::compute_bytes_per_cell( + const ArraySchema& array_schema, + const std::unordered_map& average_var_cell_sizes) + const { + auto attribute_num = array_schema.attribute_num(); + auto& domain{array_schema.domain()}; + auto dim_num = array_schema.dim_num(); + auto sparse = !array_schema.dense(); + + std::vector buffer_weights; + buffer_weights.reserve(attribute_num * 3 + dim_num * 2 + 3); + for (unsigned i = 0; i < attribute_num; ++i) { + const auto attr = array_schema.attributes()[i]; + const auto var_size = attr->var_size(); + buffer_weights.emplace_back( + var_size ? constants::cell_var_offset_size : attr->cell_size()); + if (var_size) { + buffer_weights.emplace_back(average_var_cell_sizes.at(attr->name())); + } + if (attr->nullable()) { + buffer_weights.emplace_back(constants::cell_validity_size); + } + } + if (sparse) { + for (unsigned i = 0; i < dim_num; ++i) { + const auto dim = domain.dimension_ptr(i); + const auto var_size = dim->var_size(); + buffer_weights.emplace_back( + var_size ? constants::cell_var_offset_size : dim->coord_size()); + if (var_size) { + buffer_weights.emplace_back(average_var_cell_sizes.at(dim->name())); + } + } + } + if (config_.with_timestamps_ && sparse) { + buffer_weights.emplace_back(constants::timestamp_size); + } + if (config_.with_delete_meta_) { + buffer_weights.emplace_back(constants::timestamp_size); + buffer_weights.emplace_back(sizeof(uint64_t)); } - return st; + return static_cast(std::accumulate( + buffer_weights.begin(), buffer_weights.end(), static_cast(0))); } void FragmentConsolidator::copy_array( - Query* query_r, Query* query_w, FragmentConsolidationWorkspace& cw) { - auto timer_se = stats_->start_timer("consolidate_copy_array"); + Query* query_r, + Query* query_w, + const ArraySchema& reader_array_schema_latest, + std::unordered_map average_var_cell_sizes, + uint64_t initial_buffer_size, + uint64_t max_queue_size, + uint64_t expected_total_bytes) { + // The size of the buffers. + uint64_t buffer_size = initial_buffer_size; + if (buffer_size > max_queue_size) { + throw FragmentConsolidatorException( + "Consolidation read 0 cells; no progress can be made without " + "disrespecting the memory budget."); + } - // Set the read query buffers outside the repeated submissions. - // The Reader will reset the query buffer sizes to the original - // sizes, not the potentially smaller sizes of the results after - // the query submission. - set_query_buffers(query_r, cw); + // Deque which stores the buffers passed between the reader and writer. + // Total size of enqueued buffers may not exceed `max_queue_size`. + // The reader will enqueue until that limit, so adjust `buffer_size` + // via `Config::initial_buffer_size` to allow concurrrent in-flight buffers. + ProducerConsumerQueue, + std::exception_ptr>> + buffer_queue; + + // Recycled workspaces from the writer. + // Allows the reader to reuse the same buffer pointers the writer has + // recently-relinquished to reduce allocations and allow the query to advance. + ProducerConsumerQueue> + recycled_buffer_queue; + + // Total size of buffers currently in a queue (allocated or recycled). + // May not exceed `max_queue_size`. + // Updated by reader before push and by writer after pop (before submit), + // preventing underflow and allowing the reader to proceed. + std::atomic current_enqueued_size = 0; + + // Flag indicating an ongoing read. The reader will stop once set to `false`. + std::atomic reading = true; + + // Total number of bytes read across the copy operation. + // Used by the safeguard to ensure we do not read past `expected_total_bytes`. + std::atomic total_bytes_read{0}; + + // Reader + auto& io_tp = resources_.io_tp(); + ThreadPool::Task read_task = io_tp.execute([&] { + while (reading) { + // READ + try { + // Create the read query buffers, ensuring we never exceed the + // memory tracker's budget, even if `buffer_size` has grown. + // When possible, reuse recycled workspace to reduce allocations. + tdb_shared_ptr cw; + if (auto recycled = recycled_buffer_queue.try_pop(); + recycled.has_value()) { + cw = std::move(recycled).value(); + } else { + cw = tdb::make_shared( + HERE(), + consolidator_memory_tracker_, + config_, + reader_array_schema_latest, + average_var_cell_sizes, + std::min(buffer_size, max_queue_size)); + } - do { - // READ - throw_if_not_ok(query_r->submit()); - - // If Consolidation cannot make any progress, throw. The first buffer will - // always contain fixed size data, whether it is tile offsets for var size - // attribute/dimension or the actual fixed size data so we can use its size - // to know if any cells were written or not. - if (cw.sizes().at(0) == 0) { - throw FragmentConsolidatorException( - "Consolidation read 0 cells, no progress can be made"); + set_query_buffers(query_r, *cw.get()); + throw_if_not_ok(query_r->submit()); + + // Only continue if Consolidation can make progress. The first buffer + // will always contain fixed size data, whether it is tile offsets for + // var size attribute/dimension or the actual fixed size data so we can + // use its size to know if any cells were written or not. + if (cw->sizes().at(0) == 0) { + // Read complete with no more data: exit so the writer can drain. + if (query_r->status() != QueryStatus::INCOMPLETE) { + buffer_queue.drain(); + reading = false; + break; + } + if (buffer_size > max_queue_size) { + throw FragmentConsolidatorException( + "Consolidation read 0 cells; no progress can be made without " + "disrespecting the memory budget."); + } + // Grow the buffer and try again. + uint64_t next_buffer_size = std::min(2 * buffer_size, max_queue_size); + if (buffer_size >= max_queue_size && + next_buffer_size == buffer_size) { + // Already at cap and still getting 0 cells; cannot make progress. + throw FragmentConsolidatorException( + "Consolidation read 0 cells; no progress can be made without " + "disrespecting the memory budget."); + } + buffer_size = next_buffer_size; + if (current_enqueued_size != 0) { + // Wait until queue has room for the next chunk. + io_tp.wait_until([&]() { + INTERCEPT( + intercept::fragment_consolidator_copy_array, + current_enqueued_size.load(), + next_buffer_size, + max_queue_size, + true); // flag indicating buffer has grown. + return current_enqueued_size + next_buffer_size <= max_queue_size; + }); + } + continue; + } else { + // Update count before push so the writer never pops and subtracts + // before we've added (which would underflow `current_enqueued_size`). + current_enqueued_size += cw->total_buffer_size(); + total_bytes_read += cw->total_buffer_size(); + buffer_queue.push(cw); + } + + // Once the read is complete, drain the queue and exit the reader. + // Infinite loop safeguard: exit upon reading `expected_total_bytes`. + // Note: `drain()` shuts down the queue without removing elements. + // The write fiber will be notified and write the remaining chunks. + if (query_r->status() != QueryStatus::INCOMPLETE || + (expected_total_bytes > 0 && + total_bytes_read.load() >= expected_total_bytes)) { + buffer_queue.drain(); + reading = false; + break; + } + } catch (...) { + // Use a minimal size for exception tracking to maintain queue logic. + current_enqueued_size += 1; + // Enqueue caught-exceptions to be handled by the writer. + buffer_queue.push(std::current_exception()); + reading = false; + break; + } + // Wait until queue has room for the next chunk; then reader continues. + io_tp.wait_until([&]() { + INTERCEPT( + intercept::fragment_consolidator_copy_array, + current_enqueued_size.load(), + buffer_size, + max_queue_size, + false); // flag indicating buffer has NOT grown. + return current_enqueued_size + buffer_size <= max_queue_size; + }); + } + return Status::Ok(); + }); + + // Writer + while (true) { + // Allow ProducerConsumerQueue to wait for an element to be enqueued. + auto buffer_queue_element = buffer_queue.pop_back(); + if (!buffer_queue_element.has_value()) { + // Stop writing once the queue is empty. + break; } - // Set explicitly the write query buffers, as the sizes may have - // been altered by the read query. - set_query_buffers(query_w, cw); + auto& buffer = buffer_queue_element.value(); + // Rethrow read-enqueued exceptions. + if (std::holds_alternative(buffer)) { + // Stop the reader, draining the queue. + reading = false; + throw_if_not_ok(read_task.wait()); + std::rethrow_exception(std::get(buffer)); + } // WRITE - throw_if_not_ok(query_w->submit()); - } while (query_r->status() == QueryStatus::INCOMPLETE); + auto& writebuf = std::get<0>(buffer); + try { + // Explicitly set the write query buffers, as the sizes may have + // been altered by the read query. + set_query_buffers(query_w, *writebuf.get()); + throw_if_not_ok(query_w->submit()); + // Relinquish buffer back to the recycled queue once we are done with it. + recycled_buffer_queue.push(writebuf); + current_enqueued_size -= writebuf->total_buffer_size(); + // Note: there is an edge case in which the reader is stuck waiting for + // the writer submit to complete before `current_enqueued_size` is + // decremented. We could immediately decrement before the try-catch, but + // then the reader would always allocate another buffer while the writer + // is in `submit()`. It's best to always recycle the workspace. + } catch (...) { + // Stop the reader, draining the queue. + reading = false; + throw_if_not_ok(read_task.wait()); + throw; + } + } + + // Wait for reader to finish + throw_if_not_ok(read_task.wait()); + + // Finalize write query + throw_if_not_ok(query_w->finalize()); } Status FragmentConsolidator::create_queries( @@ -1022,6 +1275,11 @@ Status FragmentConsolidator::set_config(const Config& config) { } config_.total_budget_ = merged_config.get("sm.mem.total_budget", Config::must_find); + config_.initial_buffer_size_user_set_ = + merged_config.set_params().count( + "sm.mem.consolidation.initial_buffer_size") > 0; + config_.initial_buffer_size_ = merged_config.get( + "sm.mem.consolidation.initial_buffer_size", Config::must_find); config_.buffers_weight_ = merged_config.get( "sm.mem.consolidation.buffers_weight", Config::must_find); config_.reader_weight_ = merged_config.get( diff --git a/tiledb/sm/consolidator/fragment_consolidator.h b/tiledb/sm/consolidator/fragment_consolidator.h index 3fcae353d7e..0f927703088 100644 --- a/tiledb/sm/consolidator/fragment_consolidator.h +++ b/tiledb/sm/consolidator/fragment_consolidator.h @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022-2024 TileDB, Inc. + * @copyright Copyright (c) 2022-2026 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -37,6 +37,7 @@ #include "tiledb/common/heap_memory.h" #include "tiledb/common/pmr.h" #include "tiledb/common/status.h" +#include "tiledb/common/util/intercept.h" #include "tiledb/sm/array/array.h" #include "tiledb/sm/consolidator/consolidator.h" #include "tiledb/sm/misc/types.h" @@ -49,6 +50,15 @@ using namespace tiledb::common; namespace tiledb::sm { +namespace intercept { +DECLARE_INTERCEPT( + fragment_consolidator_copy_array, + const uint64_t&, + uint64_t&, + uint64_t&, + bool); +} + class ArraySchema; class Config; class Query; @@ -82,6 +92,10 @@ struct FragmentConsolidationConfig : Consolidator::ConsolidationConfigBase { uint64_t buffer_size_; /** Total memory budget for consolidation operation. */ uint64_t total_budget_; + /** Initial size of the consolidation buffers before growth. */ + uint64_t initial_buffer_size_; + /** True if user explicitly set sm.mem.consolidation.initial_buffer_size. */ + bool initial_buffer_size_user_set_; /** Consolidation buffers weight used to partition total budget. */ uint64_t buffers_weight_; /** Reader weight used to partition total budget. */ @@ -115,32 +129,27 @@ struct FragmentConsolidationConfig : Consolidator::ConsolidationConfigBase { */ class FragmentConsolidationWorkspace { public: - FragmentConsolidationWorkspace(shared_ptr memory_tracker); - - // Disable copy and move construction/assignment so we don't have - // to think about it. - DISABLE_COPY_AND_COPY_ASSIGN(FragmentConsolidationWorkspace); - DISABLE_MOVE_AND_MOVE_ASSIGN(FragmentConsolidationWorkspace); - /** - * Resize the buffers that will be used upon reading the input fragments and - * writing into the new fragment. It also retrieves the number of buffers - * created. + * Constructor. * - * @param stats The stats. + * @param memory_tracker The workspace's MemoryTracker. * @param config The consolidation config. * @param array_schema The array schema. * @param avg_cell_sizes The average cell sizes. * @param total_buffers_budget Total budget for the consolidation buffers. - * @return a consolidation workspace containing the buffers */ - void resize_buffers( - stats::Stats* stats, + FragmentConsolidationWorkspace( + shared_ptr memory_tracker, const FragmentConsolidationConfig& config, const ArraySchema& array_schema, std::unordered_map& avg_cell_sizes, uint64_t total_buffers_budget); + // Disable copy and move construction/assignment so we don't have + // to think about it. + DISABLE_COPY_AND_COPY_ASSIGN(FragmentConsolidationWorkspace); + DISABLE_MOVE_AND_MOVE_ASSIGN(FragmentConsolidationWorkspace); + /** Accessor for buffers. */ tdb::pmr::vector>& buffers() { return buffers_; @@ -151,6 +160,11 @@ class FragmentConsolidationWorkspace { return sizes_; }; + /** Accessor for the total allocated buffer size. */ + size_t total_buffer_size() const { + return backing_buffer_.size(); + } + private: /*** The backing buffer used for all buffers. */ tdb::pmr::vector backing_buffer_; @@ -293,6 +307,7 @@ class FragmentConsolidator : public Consolidator { * @param new_fragment_uri The URI of the fragment created after * consolidating the `to_consolidate` fragments. * @param cw A workspace containing buffers for the queries + * * @return Status */ Status consolidate_internal( @@ -300,20 +315,46 @@ class FragmentConsolidator : public Consolidator { shared_ptr array_for_writes, const std::vector& to_consolidate, const NDRange& union_non_empty_domains, - URI* new_fragment_uri, - FragmentConsolidationWorkspace& cw); + URI* new_fragment_uri); + + /** + * Copy of buffer-weight logic from `FragmentConsolidationWorkspace`. + * + * Estimates the total bytes per cell for incoming read buffers. + * + * @param array_schema The reader's latest array schema. + * @param avg_var_cell_sizes A map of the reader's computed average cell size + * for var size attrs / dims. + * @return An estimate of the total bytes per-cell of the read buffers. + */ + uint64_t compute_bytes_per_cell( + const ArraySchema& array_schema, + const std::unordered_map& average_var_cell_sizes) + const; /** - * Copies the array by reading from the fragments to be consolidated - * with `query_r` and writing to the new fragment with `query_w`. + * Copies the array by concurrently reading from the fragments to be + * consolidated with `query_r` and writing to the new fragment with `query_w`. * It also appropriately sets the query buffers. * * @param query_r The read query. * @param query_w The write query. - * @param cw A workspace containing buffers for the queries + * @param reader_array_schema_latest The reader's latest array schema. + * @param avg_var_cell_sizes A map of the reader's computed average cell size + * for var size attrs / dims. + * @param initial_buffer_size Initial size of consolidation buffers. + * @param max_queue_size Maximum total size of in-flight buffers in queue. + * For pipeline throughput, use at least 2x typical chunk (buffer) size. + * @param expected_total_bytes Estimated total read size; default 0 (unknown) */ void copy_array( - Query* query_r, Query* query_w, FragmentConsolidationWorkspace& cw); + Query* query_r, + Query* query_w, + const ArraySchema& reader_array_schema_latest, + std::unordered_map avg_var_cell_sizes, + uint64_t initial_buffer_size, + uint64_t max_queue_size, + uint64_t expected_total_bytes = 0); /** * Creates the queries needed for consolidation. It also retrieves diff --git a/tiledb/sm/consolidator/test/unit_fragment_consolidator.cc b/tiledb/sm/consolidator/test/unit_fragment_consolidator.cc index f67412aa495..76baa4b48ad 100644 --- a/tiledb/sm/consolidator/test/unit_fragment_consolidator.cc +++ b/tiledb/sm/consolidator/test/unit_fragment_consolidator.cc @@ -5,7 +5,7 @@ * * The MIT License * - * @copyright Copyright (c) 2022-2024 TileDB, Inc. + * @copyright Copyright (c) 2022-2026 TileDB, Inc. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -157,7 +157,6 @@ shared_ptr make_schema( TEST_CASE( "Fragment consolidator: test buffer creation", "[fragment_consolidator][create_buffers]") { - stats::Stats statistics("default"); shared_ptr schema = nullptr; std::vector expected_sizes; std::unordered_map avg_cell_sizes; @@ -222,8 +221,8 @@ TEST_CASE( cfg.with_delete_meta_ = with_delete_meta; cfg.buffer_size_ = 1000; - FragmentConsolidationWorkspace cw(tiledb::test::get_test_memory_tracker()); - cw.resize_buffers(&statistics, cfg, *schema, avg_cell_sizes, 1); + FragmentConsolidationWorkspace cw( + tiledb::test::get_test_memory_tracker(), cfg, *schema, avg_cell_sizes, 1); auto& buffers = cw.buffers(); auto& buffer_sizes = cw.sizes();