From ea7dab3b29533d1e6b4d96c1588b00d37d0f32a6 Mon Sep 17 00:00:00 2001 From: Sihyeon Jang Date: Fri, 10 Apr 2026 11:24:35 +0900 Subject: [PATCH] fix(prometheus): flush expired slab memory in exporter timer When metrics are registered with an `expire` value, nginx's slab allocator marks entries as logically expired but does not return the underlying slab pages to the free-space pool automatically. As a result, `free_space_bytes` decreases monotonically over time even when many time series have expired, because slabs are only reclaimed when a flush is explicitly requested. Call `dict:flush_expired(1000)` in `exporter_timer` (which runs every `refresh_interval`, defaulting to 15 s) so that expired slabs are reclaimed promptly. The `max_count=1000` argument bounds the write-lock hold time to a few milliseconds per call, avoiding any noticeable impact on worker request processing. Fixes the pattern where `apisix_shared_dict_free_space_bytes` for `prometheus-metrics` decreases continuously until the dict is exhausted, even though active time-series counts fluctuate normally. --- apisix/plugins/prometheus/exporter.lua | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/apisix/plugins/prometheus/exporter.lua b/apisix/plugins/prometheus/exporter.lua index 2d4ed346e8fc..70f03d05daa9 100644 --- a/apisix/plugins/prometheus/exporter.lua +++ b/apisix/plugins/prometheus/exporter.lua @@ -613,6 +613,17 @@ local function exporter_timer(premature, yieldable, cache_exptime) return end + -- Explicitly flush expired entries from the prometheus-metrics shared dict to reclaim + -- slab memory. Setting `expire` on metrics causes logical expiry but nginx's slab + -- allocator does not return freed slabs to the free-space pool automatically. + -- Without this call, free_space_bytes decreases monotonically even as time series + -- expire, because the slabs are only reclaimed when explicitly flushed. + -- max_count=1000 bounds the write-lock hold time to a few milliseconds per cycle. + local prom_dict = ngx.shared["prometheus-metrics"] + if prom_dict then + prom_dict:flush_expired(1000) + end + -- Clear the cached data after cache_exptime to prevent stale data in case of an error. local _, err, forcible = shdict_prometheus_cache:set(CACHED_METRICS_KEY, res, cache_exptime) if err then