From e4f1caf54010a43d106e4ed7c46082bc9be01897 Mon Sep 17 00:00:00 2001
From: NIK-TIGER-BILL <59732804+NIK-TIGER-BILL@users.noreply.github.com>
Date: Sat, 23 May 2026 23:14:10 +0000
Subject: [PATCH 1/2] fix: exclude failed queries from aggregate score in
 evaluators

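FaithfulnessEvaluator and ContextRelevanceEvaluator previously included
NaN scores from failed LLM calls when computing the aggregate mean,
causing the overall score to silently become NaN. Failed queries are
now excluded from the mean, and a warning reporting how many queries
were skipped is logged. If every query fails, the aggregate score is
still NaN. The per-query values in individual_scores are unchanged and
keep NaN for failed queries.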

Fixes #11383

Signed-off-by: NIK-TIGER-BILL <59732804+NIK-TIGER-BILL@users.noreply.github.com>
---
 .../components/evaluators/context_relevance.py     | 14 +++++++++++---
 haystack/components/evaluators/faithfulness.py     | 14 +++++++++++---
 .../notes/fix-evaluator-nan-scores-abc123.yaml     |  3 +++
 .../evaluators/test_context_relevance_evaluator.py |  9 ++++++---
 .../evaluators/test_faithfulness_evaluator.py      |  9 ++++++---
 5 files changed, 37 insertions(+), 12 deletions(-)
 create mode 100644 releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml

diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
index f5db0655e4..7830d847ca 100644
--- a/haystack/components/evaluators/context_relevance.py
+++ b/haystack/components/evaluators/context_relevance.py
@@ -2,15 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from statistics import mean
 from typing import Any
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Private global variable for default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -182,8 +185,13 @@ def run(self, **inputs: Any) -> dict[str, Any]:
                 res["score"] = 0
 
         # calculate average context relevance score over all queries
-        result["score"] = mean([res["score"] for res in result["results"]])
-        result["individual_scores"] = [res["score"] for res in result["results"]]  # useful for the EvaluationRunResult
+        scores = [res["score"] for res in result["results"]]
+        valid_scores = [s for s in scores if not math.isnan(s)]
+        skipped = len(scores) - len(valid_scores)
+        if skipped:
+            logger.warning("%s query(s) failed and were excluded from the score.", skipped)
+        result["score"] = mean(valid_scores) if valid_scores else float("nan")
+        result["individual_scores"] = scores  # useful for the EvaluationRunResult
 
         return result
 
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 99b622160f..2fe0d005a8 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -2,16 +2,19 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from typing import Any
 
 from numpy import mean as np_mean
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -176,8 +179,13 @@ def run(self, **inputs: Any) -> dict[str, Any]:
                 res["score"] = np_mean(res["statement_scores"])
 
         # calculate average answer faithfulness score over all queries
-        result["score"] = np_mean([res["score"] for res in result["results"]])
-        result["individual_scores"] = [res["score"] for res in result["results"]]
+        scores = [res["score"] for res in result["results"]]
+        valid_scores = [s for s in scores if not math.isnan(s)]
+        skipped = len(scores) - len(valid_scores)
+        if skipped:
+            logger.warning("%s query(s) failed and were excluded from the score.", skipped)
+        result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
+        result["individual_scores"] = scores
 
         return result
 
diff --git a/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml b/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml
new file mode 100644
index 0000000000..09292ba3d1
--- /dev/null
+++ b/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml
@@ -0,0 +1,3 @@
+fixes:
+  - |
+    Fixed ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` to exclude failed queries (score ``nan``) from the aggregate mean score instead of silently returning ``nan``. A warning is now logged when one or more queries are skipped.
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py
index 9eca698879..ae145c0779 100644
--- a/test/components/evaluators/test_context_relevance_evaluator.py
+++ b/test/components/evaluators/test_context_relevance_evaluator.py
@@ -206,7 +206,7 @@ def test_run_missing_parameters(self, monkeypatch):
         with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
             component.run()
 
-    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
+    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = ContextRelevanceEvaluator(raise_on_failure=False)
 
@@ -231,13 +231,16 @@ def chat_generator_run(self, *args, **kwargs):
                 "programmers write clear, logical code for both small and large-scale software projects."
             ],
         ]
-        results = component.run(questions=questions, contexts=contexts)
+        with caplog.at_level("WARNING", logger="haystack.components.evaluators.context_relevance"):
+            results = component.run(questions=questions, contexts=contexts)
 
-        assert math.isnan(results["score"])
+        assert results["score"] == 1
         assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
         assert results["results"][1]["relevant_statements"] == []
         assert math.isnan(results["results"][1]["score"])
 
+        assert "1 query(s) failed and were excluded from the score." in caplog.text
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py
index 64d113462a..4ae555a12a 100644
--- a/test/components/evaluators/test_faithfulness_evaluator.py
+++ b/test/components/evaluators/test_faithfulness_evaluator.py
@@ -253,7 +253,7 @@ def test_run_missing_parameters(self, monkeypatch):
         with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
             component.run()
 
-    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
+    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = FaithfulnessEvaluator(raise_on_failure=False)
 
@@ -282,9 +282,10 @@ def chat_generator_run(self, *args, **kwargs):
             "Football is the most popular sport with around 4 billion followers worldwide.",
             "Guido van Rossum.",
         ]
-        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+        with caplog.at_level("WARNING", logger="haystack.components.evaluators.faithfulness"):
+            results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 
-        assert math.isnan(results["score"])
+        assert results["score"] == 1.0
 
         assert results["individual_scores"][0] == 1.0
         assert math.isnan(results["individual_scores"][1])
@@ -295,6 +296,8 @@ def chat_generator_run(self, *args, **kwargs):
         assert results["results"][1]["statement_scores"] == []
         assert math.isnan(results["results"][1]["score"])
 
+        assert "1 query(s) failed and were excluded from the score." in caplog.text
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",

From 17163fccfa8de8eaeff04a29e478aacc4fc8ba49 Mon Sep 17 00:00:00 2001
From: Julian Risch <julian.risch@deepset.ai>
Date: Fri, 5 Jun 2026 15:58:54 +0200
Subject: [PATCH 2/2] fix: use Haystack logging convention for skipped-query
 warning

The warning for excluded NaN scores in FaithfulnessEvaluator and
ContextRelevanceEvaluator used %s positional formatting, which
Haystack's keyword-only logger rejects with a TypeError, crashing
run() in exactly the failed-query path this change is meant to handle.
Switch to {}-style interpolation with a keyword argument.

Also give the release note a proper reno hash filename and add the
missing YAML document start marker.
---
 haystack/components/evaluators/context_relevance.py             | 2 +-
 haystack/components/evaluators/faithfulness.py                  | 2 +-
 ...c123.yaml => fix-evaluator-nan-scores-ba6953344443c788.yaml} | 1 +
 3 files changed, 3 insertions(+), 2 deletions(-)
 rename releasenotes/notes/{fix-evaluator-nan-scores-abc123.yaml => fix-evaluator-nan-scores-ba6953344443c788.yaml} (98%)

diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py
index 7830d847ca..158d108d45 100644
--- a/haystack/components/evaluators/context_relevance.py
+++ b/haystack/components/evaluators/context_relevance.py
@@ -189,7 +189,7 @@ def run(self, **inputs: Any) -> dict[str, Any]:
         valid_scores = [s for s in scores if not math.isnan(s)]
         skipped = len(scores) - len(valid_scores)
         if skipped:
-            logger.warning("%s query(s) failed and were excluded from the score.", skipped)
+            logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped)
         result["score"] = mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = scores  # useful for the EvaluationRunResult
 
diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py
index 2fe0d005a8..f3328569af 100644
--- a/haystack/components/evaluators/faithfulness.py
+++ b/haystack/components/evaluators/faithfulness.py
@@ -183,7 +183,7 @@ def run(self, **inputs: Any) -> dict[str, Any]:
         valid_scores = [s for s in scores if not math.isnan(s)]
         skipped = len(scores) - len(valid_scores)
         if skipped:
-            logger.warning("%s query(s) failed and were excluded from the score.", skipped)
+            logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped)
         result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
         result["individual_scores"] = scores
 
diff --git a/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml b/releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml
similarity index 98%
rename from releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml
rename to releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml
index 09292ba3d1..9a88c0c0c1 100644
--- a/releasenotes/notes/fix-evaluator-nan-scores-abc123.yaml
+++ b/releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml
@@ -1,3 +1,4 @@
+---
 fixes:
   - |
     Fixed ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` to exclude failed queries (score ``nan``) from the aggregate mean score instead of silently returning ``nan``. A warning is now logged when one or more queries are skipped.