From 32bbc5a0674ac6c3ccef64146ade83b920f2392d Mon Sep 17 00:00:00 2001 From: junruizh2021 Date: Fri, 15 May 2026 10:37:09 +0800 Subject: [PATCH] Log VLM perf metrics at DEBUG level for legacy and continuous batching pipelines Add debug-level logging of VLM performance metrics including: - input_token_count, prepare_embeddings_time_ms, ttft_ms, prefill_speed_tps - image_slice_count (from GenAI VLMPerfMetrics) - For CB pipeline: output_token_count, llm_ttft_ms, total_token_count Legacy VLM pipeline logs metrics after unary and streaming responses. Continuous batching VLM pipeline retrieves request-level metrics via GenerationHandle::get_vlm_perf_metrics() (requires openvinotoolkit/openvino.genai#3860). CVS-186710 --- src/llm/apis/openai_api_handler.hpp | 1 + .../continuous_batching/servable.cpp | 42 +++++++++++++++++++ .../continuous_batching/servable.hpp | 2 + .../visual_language_model/legacy/servable.cpp | 20 +++++++++ 4 files changed, 65 insertions(+) diff --git a/src/llm/apis/openai_api_handler.hpp b/src/llm/apis/openai_api_handler.hpp index 7c56bcbf95..33616be30f 100644 --- a/src/llm/apis/openai_api_handler.hpp +++ b/src/llm/apis/openai_api_handler.hpp @@ -155,6 +155,7 @@ class OpenAIApiHandler { std::string getModel() const; std::string getToolChoice() const; const std::unique_ptr& getOutputParser() const; + const CompletionUsageStatistics& getUsage() const { return usage; } // Usage tracking void setPromptTokensUsage(size_t promptTokens); diff --git a/src/llm/visual_language_model/continuous_batching/servable.cpp b/src/llm/visual_language_model/continuous_batching/servable.cpp index 0ef06d22df..5530cfb85b 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.cpp +++ b/src/llm/visual_language_model/continuous_batching/servable.cpp @@ -16,6 +16,7 @@ #include "servable.hpp" +#include #include #include #include @@ -28,6 +29,29 @@ namespace ovms { +static void logVLMContinuousBatchingDebug(const std::shared_ptr& apiHandler, const ov::genai::GenerationHandle& generationHandle) { + const auto& usage = apiHandler->getUsage(); + auto perfMetrics = generationHandle->get_vlm_perf_metrics(); + constexpr double minPrefillDurationMs = 1e-9; + const double prepareEmbeddingsTimeMs = perfMetrics.get_prepare_embeddings_duration().mean; + const double llmTtftMs = perfMetrics.get_ttft().mean; + const double ttftMs = prepareEmbeddingsTimeMs + llmTtftMs; + const double prefillDurationMs = std::max(llmTtftMs, minPrefillDurationMs); + const double prefillSpeedTps = (1000.0 * perfMetrics.get_num_input_tokens()) / prefillDurationMs; + + SPDLOG_LOGGER_DEBUG( + llm_calculator_logger, + "VLM continuous batching metrics | input_token_count: {} | output_token_count: {} | total_token_count: {} | prepare_embeddings_time_ms: {:.3f} | llm_ttft_ms: {:.3f} | ttft_ms: {:.3f} | prefill_speed_tps: {:.3f} | image_slice_count: {}", + perfMetrics.get_num_input_tokens(), + usage.completionTokens, + perfMetrics.get_num_input_tokens() + usage.completionTokens, + prepareEmbeddingsTimeMs, + llmTtftMs, + ttftMs, + prefillSpeedTps, + perfMetrics.get_image_slice_count()); +} + absl::Status VisualLanguageModelServable::addRequestToPipeline(std::shared_ptr& executionContext) { auto vlmExecutionContext = std::static_pointer_cast(executionContext); vlmExecutionContext->generationHandle = properties->pipeline->add_request(currentRequestId++, // to be removed from API? @@ -119,4 +143,22 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr& executionContext) { + auto status = GenAiServable::prepareCompleteResponse(executionContext); + if (status.ok()) { + auto vlmExecutionContext = std::static_pointer_cast(executionContext); + logVLMContinuousBatchingDebug(executionContext->apiHandler, vlmExecutionContext->generationHandle); + } + return status; +} + +absl::Status VisualLanguageModelServable::preparePartialResponse(std::shared_ptr& executionContext) { + auto status = GenAiServable::preparePartialResponse(executionContext); + if (status.ok() && !executionContext->sendLoopbackSignal) { + auto vlmExecutionContext = std::static_pointer_cast(executionContext); + logVLMContinuousBatchingDebug(executionContext->apiHandler, vlmExecutionContext->generationHandle); + } + return status; +} } // namespace ovms diff --git a/src/llm/visual_language_model/continuous_batching/servable.hpp b/src/llm/visual_language_model/continuous_batching/servable.hpp index a50b5c340d..824c0b2326 100644 --- a/src/llm/visual_language_model/continuous_batching/servable.hpp +++ b/src/llm/visual_language_model/continuous_batching/servable.hpp @@ -52,5 +52,7 @@ class VisualLanguageModelServable : public ContinuousBatchingServable { std::shared_ptr createExecutionContext() override; std::shared_ptr getProperties() override; absl::Status prepareInputs(std::shared_ptr& executionContext) override; + absl::Status prepareCompleteResponse(std::shared_ptr& executionContext) override; + absl::Status preparePartialResponse(std::shared_ptr& executionContext) override; }; } // namespace ovms diff --git a/src/llm/visual_language_model/legacy/servable.cpp b/src/llm/visual_language_model/legacy/servable.cpp index 1bb2367001..f98e10dc0f 100644 --- a/src/llm/visual_language_model/legacy/servable.cpp +++ b/src/llm/visual_language_model/legacy/servable.cpp @@ -14,6 +14,7 @@ // limitations under the License. //***************************************************************************** +#include #include #include #include @@ -46,6 +47,23 @@ namespace ovms { +static void logVLMPerfMetricsDebug(ov::genai::VLMPerfMetrics& perfMetrics) { + constexpr double minPrefillDurationMs = 1e-9; + const double prepareEmbeddingsTimeMs = perfMetrics.get_prepare_embeddings_duration().mean; + const double ttftMs = perfMetrics.get_ttft().mean; + const double prefillDurationMs = std::max(ttftMs - prepareEmbeddingsTimeMs, minPrefillDurationMs); + const double prefillSpeedTps = (1000.0 * perfMetrics.get_num_input_tokens()) / prefillDurationMs; + + SPDLOG_LOGGER_DEBUG( + llm_calculator_logger, + "VLM perf metrics | input_token_count: {} | prepare_embeddings_time_ms: {:.3f} | ttft_ms: {:.3f} | prefill_speed_tps: {:.3f} | image_slice_count: {}", + perfMetrics.get_num_input_tokens(), + prepareEmbeddingsTimeMs, + ttftMs, + prefillSpeedTps, + perfMetrics.get_image_slice_count()); +} + absl::Status VisualLanguageModelLegacyServable::loadRequest(std::shared_ptr& executionContext, const ovms::HttpPayload& payload) { SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Request body: {}", payload.body); SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Request uri: {}", payload.uri); @@ -196,6 +214,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareCompleteResponse(std::sha } executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results, completeText); + logVLMPerfMetricsDebug(legacyExecutionContext->results.perf_metrics); SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Complete unary response: {}", executionContext->response); return absl::OkStatus(); } @@ -256,6 +275,7 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar executionContext->response += wrapTextInServerSideEventMessage("[DONE]"); + logVLMPerfMetricsDebug(legacyExecutionContext->results.perf_metrics); SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Generated complete streaming response: {}", executionContext->response); executionContext->sendLoopbackSignal = false; return absl::OkStatus();