deepset-ai · NIK-TIGER-BILL · May 23, 2026 · Jun 5, 2026
@@ -2,15 +2,18 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from statistics import mean
 from typing import Any
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Private global variable for default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -182,8 +185,13 @@ def run(self, **inputs: Any) -> dict[str, Any]:
                 res["score"] = 0
 
         # calculate average context relevance score over all queries
-        result["score"] = mean([res["score"] for res in result["results"]])
-        result["individual_scores"] = [res["score"] for res in result["results"]]  # useful for the EvaluationRunResult
+        scores = [res["score"] for res in result["results"]]
+        valid_scores = [s for s in scores if not math.isnan(s)]
+        skipped = len(scores) - len(valid_scores)
+        if skipped:
+            logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped)
+        result["score"] = mean(valid_scores) if valid_scores else float("nan")
+        result["individual_scores"] = scores  # useful for the EvaluationRunResult
 
         return result
 

@@ -2,16 +2,19 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
+import math
 from typing import Any
 
 from numpy import mean as np_mean
 
-from haystack import component, default_from_dict, default_to_dict
+from haystack import component, default_from_dict, default_to_dict, logging
 from haystack.components.evaluators.llm_evaluator import LLMEvaluator
 from haystack.components.generators.chat.types import ChatGenerator
 from haystack.core.serialization import component_to_dict
 from haystack.utils import deserialize_chatgenerator_inplace
 
+logger = logging.getLogger(__name__)
+
 # Default examples to include in the prompt if the user does not provide any examples
 _DEFAULT_EXAMPLES = [
     {
@@ -176,8 +179,13 @@ def run(self, **inputs: Any) -> dict[str, Any]:
                 res["score"] = np_mean(res["statement_scores"])
 
         # calculate average answer faithfulness score over all queries
-        result["score"] = np_mean([res["score"] for res in result["results"]])
-        result["individual_scores"] = [res["score"] for res in result["results"]]
+        scores = [res["score"] for res in result["results"]]
+        valid_scores = [s for s in scores if not math.isnan(s)]
+        skipped = len(scores) - len(valid_scores)
+        if skipped:
+            logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped)
+        result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
+        result["individual_scores"] = scores
 
         return result
 

@@ -0,0 +1,4 @@
+---
+fixes:
+  - |
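+    Fixed ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` to exclude failed queries (score ``nan``) from the aggregate mean score instead of silently returning ``nan`` for the whole run. The per-query ``individual_scores`` still contain ``nan`` for failed queries, and the aggregate score is ``nan`` only when every query fails. A warning is now logged when one or more queries are skipped.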
@@ -206,7 +206,7 @@ def test_run_missing_parameters(self, monkeypatch):
         with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
             component.run()
 
-    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
+    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = ContextRelevanceEvaluator(raise_on_failure=False)
 
@@ -231,13 +231,16 @@ def chat_generator_run(self, *args, **kwargs):
                 "programmers write clear, logical code for both small and large-scale software projects."
             ],
         ]
-        results = component.run(questions=questions, contexts=contexts)
+        with caplog.at_level("WARNING", logger="haystack.components.evaluators.context_relevance"):
+            results = component.run(questions=questions, contexts=contexts)
 
-        assert math.isnan(results["score"])
+        assert results["score"] == 1
         assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
         assert results["results"][1]["relevant_statements"] == []
         assert math.isnan(results["results"][1]["score"])
 
+        assert "1 query(s) failed and were excluded from the score." in caplog.text
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",

@@ -253,7 +253,7 @@ def test_run_missing_parameters(self, monkeypatch):
         with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
             component.run()
 
-    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
+    def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
         monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
         component = FaithfulnessEvaluator(raise_on_failure=False)
 
@@ -282,9 +282,10 @@ def chat_generator_run(self, *args, **kwargs):
             "Football is the most popular sport with around 4 billion followers worldwide.",
             "Guido van Rossum.",
         ]
-        results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
+        with caplog.at_level("WARNING", logger="haystack.components.evaluators.faithfulness"):
+            results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
 
-        assert math.isnan(results["score"])
+        assert results["score"] == 1.0
 
         assert results["individual_scores"][0] == 1.0
         assert math.isnan(results["individual_scores"][1])
@@ -295,6 +296,8 @@ def chat_generator_run(self, *args, **kwargs):
         assert results["results"][1]["statement_scores"] == []
         assert math.isnan(results["results"][1]["score"])
 
+        assert "1 query(s) failed and were excluded from the score." in caplog.text
+
     @pytest.mark.skipif(
         not os.environ.get("OPENAI_API_KEY", None),
         reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",