diff --git a/include/my_atomic_wrapper.h b/include/my_atomic_wrapper.h index 7b35b14d3b79a..042b9966c4bd8 100644 --- a/include/my_atomic_wrapper.h +++ b/include/my_atomic_wrapper.h @@ -63,6 +63,10 @@ template <typename Type> class Atomic_relaxed std::memory_order o1= std::memory_order_relaxed, std::memory_order o2= std::memory_order_relaxed) { return m.compare_exchange_strong(i1, i2, o1, o2); } + bool compare_exchange_weak(Type& i1, const Type i2, + std::memory_order o1= std::memory_order_relaxed, + std::memory_order o2= std::memory_order_relaxed) + { return m.compare_exchange_weak(i1, i2, o1, o2); } Type exchange(const Type i, std::memory_order o= std::memory_order_relaxed) { return m.exchange(i, o); } }; diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 00cbe03d56157..15083f0c0e30f 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -61,8 +61,10 @@ static constexpr ulint buf_flush_lsn_scan_factor = 3; /** Average redo generation rate */ static lsn_t lsn_avg_rate = 0; -/** Target oldest_modification for the page cleaner background flushing; -writes are protected by buf_pool.flush_list_mutex */ +/** Target oldest_modification for the page cleaner background flushing. +Bumped lock-free via monotonic-max CAS-loop from buf_flush_ahead(); +cleared with CAS by buf_flush_page_cleaner() (under buf_pool.flush_list_mutex) +and by buf_flush_sync_for_checkpoint() (under exclusive log_sys.latch).
*/ static Atomic_relaxed<lsn_t> buf_flush_async_lsn; /** Target oldest_modification for the page cleaner furious flushing; writes are protected by buf_pool.flush_list_mutex */ @@ -112,6 +114,7 @@ static void buf_flush_validate_skip() noexcept void buf_pool_t::page_cleaner_wakeup(bool for_LRU) noexcept { + mysql_mutex_assert_owner(&flush_list_mutex); ut_d(buf_flush_validate_skip()); if (!page_cleaner_idle()) { @@ -154,7 +157,7 @@ void buf_pool_t::page_cleaner_wakeup(bool for_LRU) noexcept last_activity_count == srv_get_activity_count())) || srv_max_buf_pool_modified_pct <= dirty_pct) { - page_cleaner_status-= PAGE_CLEANER_IDLE; + page_cleaner_idle_flag= false; pthread_cond_signal(&do_flush_list); } } @@ -2113,33 +2116,55 @@ ATTRIBUTE_COLD void buf_flush_ahead(lsn_t lsn, bool furious) noexcept DBUG_EXECUTE_IF("ib_log_checkpoint_avoid_hard", return;); - Atomic_relaxed<lsn_t> &limit= furious - ? buf_flush_sync_lsn : buf_flush_async_lsn; + if (!furious) + { + /* Non-furious: lock-free monotonic-max via CAS-loop. */ + lsn_t cur= buf_flush_async_lsn; + while (cur < lsn) + if (buf_flush_async_lsn.compare_exchange_weak(cur, lsn)) + goto bumped; + return; + + bumped: + /* In non-furious mode, concurrent writes to the log will remain + possible, and we are gently requesting buf_flush_page_cleaner() + to do more work to avoid a later call with furious=true. + We will only wake the buf_flush_page_cleaner() from an indefinite + my_cond_wait(), but we will not disturb the regular 1-second sleep. + + To reduce contention for a gentle request case, the idle flag of + buf_pool is read outside of a buf_pool.flush_list_mutex critical + section, allowing early return. + + If the decision to return early was (unlikely) wrong, subsequent + wake-side paths (including further buf_flush_ahead() calls under + redo-log pressure) are expected to perform the wake eventually. */ + if (!buf_pool.page_cleaner_idle()) + return; - if (limit < lsn) + /* Signal under buf_pool.flush_list_mutex.
*/ + mysql_mutex_lock(&buf_pool.flush_list_mutex); + goto wake; + } + + /* The furious path forces a log checkpoint via + log_sys.set_check_for_checkpoint(true), so the buf_flush_sync_lsn + bump, the checkpoint flag, and the cleaner wake must all be observed + together by any thread synchronising on buf_pool.flush_list_mutex. */ + if (buf_flush_sync_lsn < lsn) { mysql_mutex_lock(&buf_pool.flush_list_mutex); - if (limit < lsn) + if (buf_flush_sync_lsn < lsn) { - limit= lsn; - if (furious) - { - /* Request any concurrent threads to wait for this batch to complete, - in log_free_check(). */ - log_sys.set_check_for_checkpoint(true); - /* Immediately wake up buf_flush_page_cleaner(), even when it - is in the middle of a 1-second my_cond_timedwait(). */ - wake: - buf_pool.page_cleaner_set_idle(false); - pthread_cond_signal(&buf_pool.do_flush_list); - } - else if (buf_pool.page_cleaner_idle()) - /* In non-furious mode, concurrent writes to the log will remain - possible, and we are gently requesting buf_flush_page_cleaner() - to do more work to avoid a later call with furious=true. - We will only wake the buf_flush_page_cleaner() from an indefinite - my_cond_wait(), but we will not disturb the regular 1-second sleep. */ - goto wake; + buf_flush_sync_lsn= lsn; + /* Request any concurrent threads to wait for this batch to complete, + in log_free_check(). */ + log_sys.set_check_for_checkpoint(true); + /* Immediately wake up buf_flush_page_cleaner(), even when it + is in the middle of a 1-second my_cond_timedwait(). */ + wake: + buf_pool.page_cleaner_set_idle(false); + pthread_cond_signal(&buf_pool.do_flush_list); } mysql_mutex_unlock(&buf_pool.flush_list_mutex); } @@ -2229,8 +2254,15 @@ static void buf_flush_sync_for_checkpoint(lsn_t lsn) noexcept /* After attempting log checkpoint, check if we have reached our target. 
*/ if (measure >= buf_flush_sync_lsn) buf_flush_sync_lsn= 0; - else if (measure >= buf_flush_async_lsn) - buf_flush_async_lsn= 0; + else + { + /* Same coordination as in buf_flush_page_cleaner(): clear only if + unchanged since the snapshot, so a concurrent buf_flush_ahead() bump + is not lost. */ + lsn_t async= buf_flush_async_lsn; + if (measure >= async) + buf_flush_async_lsn.compare_exchange_strong(async, 0); + } log_sys.latch.wr_unlock(); /* wake up buf_flush_wait() */ @@ -2582,7 +2614,14 @@ static void buf_flush_page_cleaner() noexcept if (UNIV_UNLIKELY(soft_lsn_limit != 0)) { if (oldest_lsn >= soft_lsn_limit) - buf_flush_async_lsn= soft_lsn_limit= 0; + { + /* Coordinate with the lock-free bumper in buf_flush_ahead(): + only clear if no concurrent thread has raised the target above + the snapshot we observed. If the CAS fails, the bumped value + survives and the next iteration picks it up. */ + buf_flush_async_lsn.compare_exchange_strong(soft_lsn_limit, 0); + soft_lsn_limit= 0; + } } else if (buf_pool.need_LRU_eviction()) { @@ -2605,8 +2644,13 @@ static void buf_flush_page_cleaner() noexcept oldest_lsn= buf_pool.get_oldest_modification(0); if (!oldest_lsn) goto fully_unemployed; - if (oldest_lsn >= buf_flush_async_lsn) - buf_flush_async_lsn= 0; + { + /* Same coordination as above: clear only if unchanged since + the snapshot, so a concurrent buf_flush_ahead() bump is not lost. */ + lsn_t async= buf_flush_async_lsn; + if (oldest_lsn >= async) + buf_flush_async_lsn.compare_exchange_strong(async, 0); + } buf_pool.page_cleaner_set_idle(false); goto set_almost_idle; } @@ -2652,6 +2696,14 @@ static void buf_flush_page_cleaner() noexcept } else if (dirty_pct < srv_max_buf_pool_modified_pct) possibly_unemployed: + /* Snapshot-based; no coordination with concurrent lock-free + buf_flush_ahead() bumpers. 
A bumper that observed + page_cleaner_idle()==false earlier in this iteration took its + early return; flipping idle=true here leaves its target + unacknowledged until another wake path fires. Same premise as the + bumper's early return: under sustained redo-log pressure, + subsequent buf_flush_ahead() calls observe + page_cleaner_idle()==true and take the wake path. */ if (!soft_lsn_limit && !af_needed_for_redo(oldest_lsn)) goto set_idle; diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 8140b1b7a0f36..fea52275a305d 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1532,13 +1532,18 @@ class buf_pool_t TPOOL_SUPPRESS_TSAN void add_flush_list_requests(size_t size) { ut_ad(size); flush_list_requests+= size; } private: - static constexpr unsigned PAGE_CLEANER_IDLE= 1; static constexpr unsigned FLUSH_LIST_ACTIVE= 2; static constexpr unsigned LRU_FLUSH= 4; - /** Number of pending LRU flush * LRU_FLUSH + - PAGE_CLEANER_IDLE + FLUSH_LIST_ACTIVE flags */ + /** Number of pending LRU flush * LRU_FLUSH + FLUSH_LIST_ACTIVE flag. + Protected by flush_list_mutex. */ unsigned page_cleaner_status; + + /** Whether the page cleaner is sleeping due to being idle. + Writes occur under flush_list_mutex; reads are lock-free (used by + the early-return in buf_flush_ahead() on a busy cleaner). */ + Atomic_relaxed<bool> page_cleaner_idle_flag; + /** track server activity count for signaling idle flushing */ ulint last_activity_count; public: @@ -1580,19 +1585,19 @@ class buf_pool_t page_cleaner_status-= FLUSH_LIST_ACTIVE; } - /** @return whether the page cleaner must sleep due to being idle */ + /** @return whether the page cleaner must sleep due to being idle. + Lock-free read of page_cleaner_idle_flag; safe to call without + flush_list_mutex (used by the early-return in buf_flush_ahead()).
*/ bool page_cleaner_idle() const noexcept { - mysql_mutex_assert_owner(&flush_list_mutex); - return page_cleaner_status & PAGE_CLEANER_IDLE; + return page_cleaner_idle_flag; } /** @return whether the page cleaner may be initiating writes */ bool page_cleaner_active() const noexcept { mysql_mutex_assert_owner(&flush_list_mutex); - static_assert(PAGE_CLEANER_IDLE == 1, "efficiency"); - return page_cleaner_status > PAGE_CLEANER_IDLE; + return page_cleaner_status != 0; } /** Wake up the page cleaner if needed. @@ -1603,8 +1608,7 @@ class buf_pool_t void page_cleaner_set_idle(bool deep_sleep) noexcept { mysql_mutex_assert_owner(&flush_list_mutex); - page_cleaner_status= (page_cleaner_status & ~PAGE_CLEANER_IDLE) | - (PAGE_CLEANER_IDLE * deep_sleep); + page_cleaner_idle_flag= deep_sleep; } /** Update server last activity count */