Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/llm/apis/openai_api_handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ class OpenAIApiHandler {
std::string getModel() const;
std::string getToolChoice() const;
const std::unique_ptr<OutputParser>& getOutputParser() const;
const CompletionUsageStatistics& getUsage() const { return usage; }

// Usage tracking
void setPromptTokensUsage(size_t promptTokens);
Expand Down
42 changes: 42 additions & 0 deletions src/llm/visual_language_model/continuous_batching/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "servable.hpp"

#include <algorithm>
#include <memory>
#include <stdexcept>
#include <string>
Expand All @@ -28,6 +29,29 @@

namespace ovms {

// Writes one debug-level log line with the performance figures of a VLM
// continuous-batching request.
//
// apiHandler       - source of the usage statistics; only completionTokens is read.
// generationHandle - source of the VLM perf metrics (prepare-embeddings duration,
//                    TTFT, input token count, image slice count).
//
// The reported ttft_ms is the sum of the mean prepare-embeddings duration and the
// mean LLM TTFT. prefill_speed_tps is computed from the LLM TTFT alone.
static void logVLMContinuousBatchingDebug(const std::shared_ptr<OpenAIApiHandler>& apiHandler, const ov::genai::GenerationHandle& generationHandle) {
    // Lower bound for the divisor so a zero TTFT cannot cause a division by zero.
    constexpr double prefillDurationFloorMs = 1e-9;

    const auto& tokenUsage = apiHandler->getUsage();
    auto metrics = generationHandle->get_vlm_perf_metrics();

    const auto inputTokenCount = metrics.get_num_input_tokens();

    const double embeddingsMs = metrics.get_prepare_embeddings_duration().mean;
    const double llmFirstTokenMs = metrics.get_ttft().mean;
    const double totalFirstTokenMs = embeddingsMs + llmFirstTokenMs;

    // The 1000.0 factor matches the "_ms" durations and "_tps" speed in the log labels.
    const double clampedPrefillMs = std::max(llmFirstTokenMs, prefillDurationFloorMs);
    const double tokensPerSecond = (1000.0 * inputTokenCount) / clampedPrefillMs;

    SPDLOG_LOGGER_DEBUG(
        llm_calculator_logger,
        "VLM continuous batching metrics | input_token_count: {} | output_token_count: {} | total_token_count: {} | prepare_embeddings_time_ms: {:.3f} | llm_ttft_ms: {:.3f} | ttft_ms: {:.3f} | prefill_speed_tps: {:.3f} | image_slice_count: {}",
        inputTokenCount,
        tokenUsage.completionTokens,
        inputTokenCount + tokenUsage.completionTokens,
        embeddingsMs,
        llmFirstTokenMs,
        totalFirstTokenMs,
        tokensPerSecond,
        metrics.get_image_slice_count());
}

absl::Status VisualLanguageModelServable::addRequestToPipeline(std::shared_ptr<ContinuousBatchingServableExecutionContext>& executionContext) {
auto vlmExecutionContext = std::static_pointer_cast<VisualLanguageModelServableExecutionContext>(executionContext);
vlmExecutionContext->generationHandle = properties->pipeline->add_request(currentRequestId++, // to be removed from API?
Expand Down Expand Up @@ -119,4 +143,22 @@ absl::Status VisualLanguageModelServable::prepareInputs(std::shared_ptr<GenAiSer

return absl::OkStatus();
}

// Delegates to GenAiServable::prepareCompleteResponse and, only when that call
// succeeds, logs the VLM continuous-batching metrics for the request.
// Returns the status produced by the base implementation unchanged.
absl::Status VisualLanguageModelServable::prepareCompleteResponse(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
    absl::Status baseStatus = GenAiServable::prepareCompleteResponse(executionContext);
    if (!baseStatus.ok()) {
        // No metrics are logged when the base implementation reports a failure.
        return baseStatus;
    }

    // The generation handle lives on the VLM-specific execution context.
    auto vlmContext = std::static_pointer_cast<VisualLanguageModelServableExecutionContext>(executionContext);
    logVLMContinuousBatchingDebug(executionContext->apiHandler, vlmContext->generationHandle);
    return baseStatus;
}

// Delegates to GenAiServable::preparePartialResponse and logs the VLM
// continuous-batching metrics when that call succeeds and sendLoopbackSignal is
// not set afterwards.
// NOTE(review): a cleared sendLoopbackSignal presumably marks the final streaming
// chunk - confirm against the base implementation.
// Returns the status produced by the base implementation unchanged.
absl::Status VisualLanguageModelServable::preparePartialResponse(std::shared_ptr<GenAiServableExecutionContext>& executionContext) {
    absl::Status baseStatus = GenAiServable::preparePartialResponse(executionContext);
    if (!baseStatus.ok() || executionContext->sendLoopbackSignal) {
        // Either the base call failed or the loopback signal is still set: skip logging.
        return baseStatus;
    }

    // The generation handle lives on the VLM-specific execution context.
    auto vlmContext = std::static_pointer_cast<VisualLanguageModelServableExecutionContext>(executionContext);
    logVLMContinuousBatchingDebug(executionContext->apiHandler, vlmContext->generationHandle);
    return baseStatus;
}
} // namespace ovms
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,7 @@ class VisualLanguageModelServable : public ContinuousBatchingServable {
std::shared_ptr<GenAiServableExecutionContext> createExecutionContext() override;
std::shared_ptr<GenAiServableProperties> getProperties() override;
absl::Status prepareInputs(std::shared_ptr<GenAiServableExecutionContext>& executionContext) override;
absl::Status prepareCompleteResponse(std::shared_ptr<GenAiServableExecutionContext>& executionContext) override;
absl::Status preparePartialResponse(std::shared_ptr<GenAiServableExecutionContext>& executionContext) override;
};
} // namespace ovms
20 changes: 20 additions & 0 deletions src/llm/visual_language_model/legacy/servable.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
// limitations under the License.
//*****************************************************************************

#include <algorithm>
#include <memory>
#include <stdexcept>
#include <string>
Expand Down Expand Up @@ -46,6 +47,23 @@

namespace ovms {

// Writes one debug-level log line with the VLM performance figures taken from
// perfMetrics: input token count, mean prepare-embeddings duration, mean TTFT,
// prefill speed and image slice count.
//
// Here the prefill duration is the TTFT minus the prepare-embeddings duration.
// The parameter is a non-const reference because the getters are invoked on it
// directly.
static void logVLMPerfMetricsDebug(ov::genai::VLMPerfMetrics& perfMetrics) {
    // Lower bound for the divisor so a zero or negative difference cannot
    // produce a division by zero or a negative speed.
    constexpr double prefillDurationFloorMs = 1e-9;

    const auto inputTokenCount = perfMetrics.get_num_input_tokens();
    const double embeddingsMs = perfMetrics.get_prepare_embeddings_duration().mean;
    const double firstTokenMs = perfMetrics.get_ttft().mean;

    // The 1000.0 factor matches the "_ms" durations and "_tps" speed in the log labels.
    const double clampedPrefillMs = std::max(firstTokenMs - embeddingsMs, prefillDurationFloorMs);
    const double tokensPerSecond = (1000.0 * inputTokenCount) / clampedPrefillMs;

    SPDLOG_LOGGER_DEBUG(
        llm_calculator_logger,
        "VLM perf metrics | input_token_count: {} | prepare_embeddings_time_ms: {:.3f} | ttft_ms: {:.3f} | prefill_speed_tps: {:.3f} | image_slice_count: {}",
        inputTokenCount,
        embeddingsMs,
        firstTokenMs,
        tokensPerSecond,
        perfMetrics.get_image_slice_count());
}

absl::Status VisualLanguageModelLegacyServable::loadRequest(std::shared_ptr<GenAiServableExecutionContext>& executionContext, const ovms::HttpPayload& payload) {
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Request body: {}", payload.body);
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Request uri: {}", payload.uri);
Expand Down Expand Up @@ -196,6 +214,7 @@ absl::Status VisualLanguageModelLegacyServable::prepareCompleteResponse(std::sha
}

executionContext->response = executionContext->apiHandler->serializeUnaryResponse(legacyExecutionContext->results, completeText);
logVLMPerfMetricsDebug(legacyExecutionContext->results.perf_metrics);
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Complete unary response: {}", executionContext->response);
return absl::OkStatus();
}
Expand Down Expand Up @@ -256,6 +275,7 @@ absl::Status VisualLanguageModelLegacyServable::preparePartialResponse(std::shar

executionContext->response += wrapTextInServerSideEventMessage("[DONE]");

logVLMPerfMetricsDebug(legacyExecutionContext->results.perf_metrics);
SPDLOG_LOGGER_DEBUG(llm_calculator_logger, "Generated complete streaming response: {}", executionContext->response);
executionContext->sendLoopbackSignal = false;
return absl::OkStatus();
Expand Down
Loading