From e4f1caf54010a43d106e4ed7c46082bc9be01897 Mon Sep 17 00:00:00 2001 From: NIK-TIGER-BILL <59732804+NIK-TIGER-BILL@users.noreply.github.com> Date: Sat, 23 May 2026 23:14:10 +0000 Subject: [PATCH 1/2] fix: exclude failed queries from aggregate score in evaluators FaithfulnessEvaluator and ContextRelevanceEvaluator previously included NaN scores from failed LLM calls when computing the aggregate mean, causing the overall score to silently become NaN. Now failed queries are excluded and a warning is logged. Fixes #11383 Signed-off-by: NIK-TIGER-BILL <59732804+NIK-TIGER-BILL@users.noreply.github.com> --- .../components/evaluators/context_relevance.py | 14 +++++++++++--- haystack/components/evaluators/faithfulness.py | 14 +++++++++++--- .../notes/fix-evaluator-nan-scores-abc123.yaml | 3 +++ .../evaluators/test_context_relevance_evaluator.py | 9 ++++++--- .../evaluators/test_faithfulness_evaluator.py | 9 ++++++--- 5 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index f5db0655e4..7830d847ca 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -2,15 +2,18 @@ # # SPDX-License-Identifier: Apache-2.0 +import math from statistics import mean from typing import Any -from haystack import component, default_from_dict, default_to_dict +from haystack import component, default_from_dict, default_to_dict, logging from haystack.components.evaluators.llm_evaluator import LLMEvaluator from haystack.components.generators.chat.types import ChatGenerator from haystack.core.serialization import component_to_dict from haystack.utils import deserialize_chatgenerator_inplace +logger = logging.getLogger(__name__) + # Private global variable for default examples to include in the prompt if the user does not provide any examples _DEFAULT_EXAMPLES = [ { @@ -182,8 +185,13 @@ def run(self, **inputs: Any) -> dict[str, Any]: res["score"] = 0 # calculate average context relevance score over all queries - result["score"] = mean([res["score"] for res in result["results"]]) - result["individual_scores"] = [res["score"] for res in result["results"]] # useful for the EvaluationRunResult + scores = [res["score"] for res in result["results"]] + valid_scores = [s for s in scores if not math.isnan(s)] + skipped = len(scores) - len(valid_scores) + if skipped: + logger.warning("%s query(s) failed and were excluded from the score.", skipped) + result["score"] = mean(valid_scores) if valid_scores else float("nan") + result["individual_scores"] = scores # useful for the EvaluationRunResult return result diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 99b622160f..2fe0d005a8 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -2,16 +2,19 @@ # # SPDX-License-Identifier: Apache-2.0 +import math from typing import Any from numpy import mean as np_mean -from haystack import component, default_from_dict, default_to_dict +from haystack import component, default_from_dict, default_to_dict, logging from haystack.components.evaluators.llm_evaluator import LLMEvaluator from haystack.components.generators.chat.types import ChatGenerator from haystack.core.serialization import component_to_dict from haystack.utils import deserialize_chatgenerator_inplace +logger = logging.getLogger(__name__) + # Default examples to include in the prompt if the user does not provide any examples _DEFAULT_EXAMPLES = [ { @@ -176,8 +179,13 @@ def run(self, **inputs: Any) -> dict[str, Any]: res["score"] = np_mean(res["statement_scores"]) # calculate average answer faithfulness score over all queries - result["score"] = np_mean([res["score"] for res in result["results"]]) - result["individual_scores"] = [res["score"] for res in result["results"]] + scores = [res["score"] for res in result["results"]] + valid_scores = [s for s in scores if not math.isnan(s)] + skipped = len(scores) - len(valid_scores) + if skipped: + logger.warning("%s query(s) failed and were excluded from the score.", skipped) + result["score"] = np_mean(valid_scores) if valid_scores else float("nan") + result["individual_scores"] = scores return result diff --git a/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml b/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml new file mode 100644 index 0000000000..09292ba3d1 --- /dev/null +++ b/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml @@ -0,0 +1,3 @@ +fixes: + - | + Fixed ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` to exclude failed queries (score ``nan``) from the aggregate mean score instead of silently returning ``nan``. A warning is now logged when one or more queries are skipped. diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index 9eca698879..ae145c0779 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -206,7 +206,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(ValueError, match="LLM evaluator expected input parameter"): component.run() - def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = ContextRelevanceEvaluator(raise_on_failure=False) @@ -231,13 +231,16 @@ def chat_generator_run(self, *args, **kwargs): "programmers write clear, logical code for both small and large-scale software projects." ], ] - results = component.run(questions=questions, contexts=contexts) + with caplog.at_level("WARNING", logger="haystack.components.evaluators.context_relevance"): + results = component.run(questions=questions, contexts=contexts) - assert math.isnan(results["score"]) + assert results["score"] == 1 assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1} assert results["results"][1]["relevant_statements"] == [] assert math.isnan(results["results"][1]["score"]) + assert "1 query(s) failed and were excluded from the score." in caplog.text + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index 64d113462a..4ae555a12a 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -253,7 +253,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(ValueError, match="LLM evaluator expected input parameter"): component.run() - def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = FaithfulnessEvaluator(raise_on_failure=False) @@ -282,9 +282,10 @@ def chat_generator_run(self, *args, **kwargs): "Football is the most popular sport with around 4 billion followers worldwide.", "Guido van Rossum.", ] - results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) + with caplog.at_level("WARNING", logger="haystack.components.evaluators.faithfulness"): + results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) - assert math.isnan(results["score"]) + assert results["score"] == 1.0 assert results["individual_scores"][0] == 1.0 assert math.isnan(results["individual_scores"][1]) @@ -295,6 +296,8 @@ def chat_generator_run(self, *args, **kwargs): assert results["results"][1]["statement_scores"] == [] assert math.isnan(results["results"][1]["score"]) + assert "1 query(s) failed and were excluded from the score." in caplog.text + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", From 17163fccfa8de8eaeff04a29e478aacc4fc8ba49 Mon Sep 17 00:00:00 2001 From: Julian Risch Date: Fri, 5 Jun 2026 15:58:54 +0200 Subject: [PATCH 2/2] fix: use Haystack logging convention for skipped-query warning The warning for excluded NaN scores in FaithfulnessEvaluator and ContextRelevanceEvaluator used %s positional formatting, which Haystack's keyword-only logger rejects with a TypeError, crashing run() in exactly the failed-query path this change is meant to handle. Switch to {}-style interpolation with a keyword argument. Also give the release note a proper reno hash filename and add the missing YAML document start marker. --- haystack/components/evaluators/context_relevance.py | 2 +- haystack/components/evaluators/faithfulness.py | 2 +- ...c123.yaml => fix-evaluator-nan-scores-ba6953344443c788.yaml} | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) rename releasenotes/notes/{fix-evaluator-nan-scores-abc123.yaml => fix-evaluator-nan-scores-ba6953344443c788.yaml} (98%) diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index 7830d847ca..158d108d45 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -189,7 +189,7 @@ def run(self, **inputs: Any) -> dict[str, Any]: valid_scores = [s for s in scores if not math.isnan(s)] skipped = len(scores) - len(valid_scores) if skipped: - logger.warning("%s query(s) failed and were excluded from the score.", skipped) + logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped) result["score"] = mean(valid_scores) if valid_scores else float("nan") result["individual_scores"] = scores # useful for the EvaluationRunResult diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 2fe0d005a8..f3328569af 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -183,7 +183,7 @@ def run(self, **inputs: Any) -> dict[str, Any]: valid_scores = [s for s in scores if not math.isnan(s)] skipped = len(scores) - len(valid_scores) if skipped: - logger.warning("%s query(s) failed and were excluded from the score.", skipped) + logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped) result["score"] = np_mean(valid_scores) if valid_scores else float("nan") result["individual_scores"] = scores diff --git a/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml b/releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml similarity index 98% rename from releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml rename to releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml index 09292ba3d1..9a88c0c0c1 100644 --- a/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml +++ b/releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml @@ -1,3 +1,4 @@ +--- fixes: - | Fixed ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` to exclude failed queries (score ``nan``) from the aggregate mean score instead of silently returning ``nan``. A warning is now logged when one or more queries are skipped.