diff --git a/haystack/components/evaluators/context_relevance.py b/haystack/components/evaluators/context_relevance.py index f5db0655e4..158d108d45 100644 --- a/haystack/components/evaluators/context_relevance.py +++ b/haystack/components/evaluators/context_relevance.py @@ -2,15 +2,18 @@ # # SPDX-License-Identifier: Apache-2.0 +import math from statistics import mean from typing import Any -from haystack import component, default_from_dict, default_to_dict +from haystack import component, default_from_dict, default_to_dict, logging from haystack.components.evaluators.llm_evaluator import LLMEvaluator from haystack.components.generators.chat.types import ChatGenerator from haystack.core.serialization import component_to_dict from haystack.utils import deserialize_chatgenerator_inplace +logger = logging.getLogger(__name__) + # Private global variable for default examples to include in the prompt if the user does not provide any examples _DEFAULT_EXAMPLES = [ { @@ -182,8 +185,13 @@ def run(self, **inputs: Any) -> dict[str, Any]: res["score"] = 0 # calculate average context relevance score over all queries - result["score"] = mean([res["score"] for res in result["results"]]) - result["individual_scores"] = [res["score"] for res in result["results"]] # useful for the EvaluationRunResult + scores = [res["score"] for res in result["results"]] + valid_scores = [s for s in scores if not math.isnan(s)] + skipped = len(scores) - len(valid_scores) + if skipped: + logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped) + result["score"] = mean(valid_scores) if valid_scores else float("nan") + result["individual_scores"] = scores # useful for the EvaluationRunResult return result diff --git a/haystack/components/evaluators/faithfulness.py b/haystack/components/evaluators/faithfulness.py index 456788c1aa..38acc4db75 100644 --- a/haystack/components/evaluators/faithfulness.py +++ b/haystack/components/evaluators/faithfulness.py @@ -2,16 +2,19 
@@ # # SPDX-License-Identifier: Apache-2.0 +import math from typing import Any from numpy import mean as np_mean -from haystack import component, default_from_dict, default_to_dict +from haystack import component, default_from_dict, default_to_dict, logging from haystack.components.evaluators.llm_evaluator import LLMEvaluator from haystack.components.generators.chat.types import ChatGenerator from haystack.core.serialization import component_to_dict from haystack.utils import deserialize_chatgenerator_inplace +logger = logging.getLogger(__name__) + # Default examples to include in the prompt if the user does not provide any examples _DEFAULT_EXAMPLES = [ { @@ -176,8 +179,13 @@ def run(self, **inputs: Any) -> dict[str, Any]: res["score"] = np_mean(res["statement_scores"]) # calculate average answer faithfulness score over all queries - result["score"] = np_mean([res["score"] for res in result["results"]]) - result["individual_scores"] = [res["score"] for res in result["results"]] + scores = [res["score"] for res in result["results"]] + valid_scores = [s for s in scores if not math.isnan(s)] + skipped = len(scores) - len(valid_scores) + if skipped: + logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped) + result["score"] = np_mean(valid_scores) if valid_scores else float("nan") + result["individual_scores"] = scores return result diff --git a/releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml b/releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml new file mode 100644 index 0000000000..9a88c0c0c1 --- /dev/null +++ b/releasenotes/notes/fix-evaluator-nan-scores-ba6953344443c788.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + Fixed ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` to exclude failed queries (score ``nan``) from the aggregate mean score instead of silently returning ``nan``. A warning is now logged when one or more queries are skipped. Failed queries are still reported as ``nan`` in ``individual_scores``, and the aggregate score is ``nan`` only when every query fails.
diff --git a/test/components/evaluators/test_context_relevance_evaluator.py b/test/components/evaluators/test_context_relevance_evaluator.py index 9eca698879..ae145c0779 100644 --- a/test/components/evaluators/test_context_relevance_evaluator.py +++ b/test/components/evaluators/test_context_relevance_evaluator.py @@ -206,7 +206,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(ValueError, match="LLM evaluator expected input parameter"): component.run() - def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = ContextRelevanceEvaluator(raise_on_failure=False) @@ -231,13 +231,16 @@ def chat_generator_run(self, *args, **kwargs): "programmers write clear, logical code for both small and large-scale software projects." ], ] - results = component.run(questions=questions, contexts=contexts) + with caplog.at_level("WARNING", logger="haystack.components.evaluators.context_relevance"): + results = component.run(questions=questions, contexts=contexts) - assert math.isnan(results["score"]) + assert results["score"] == 1 assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1} assert results["results"][1]["relevant_statements"] == [] assert math.isnan(results["results"][1]["score"]) + assert "1 query(s) failed and were excluded from the score." 
in caplog.text + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.", diff --git a/test/components/evaluators/test_faithfulness_evaluator.py b/test/components/evaluators/test_faithfulness_evaluator.py index 64d113462a..4ae555a12a 100644 --- a/test/components/evaluators/test_faithfulness_evaluator.py +++ b/test/components/evaluators/test_faithfulness_evaluator.py @@ -253,7 +253,7 @@ def test_run_missing_parameters(self, monkeypatch): with pytest.raises(ValueError, match="LLM evaluator expected input parameter"): component.run() - def test_run_returns_nan_raise_on_failure_false(self, monkeypatch): + def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog): monkeypatch.setenv("OPENAI_API_KEY", "test-api-key") component = FaithfulnessEvaluator(raise_on_failure=False) @@ -282,9 +282,10 @@ def chat_generator_run(self, *args, **kwargs): "Football is the most popular sport with around 4 billion followers worldwide.", "Guido van Rossum.", ] - results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) + with caplog.at_level("WARNING", logger="haystack.components.evaluators.faithfulness"): + results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers) - assert math.isnan(results["score"]) + assert results["score"] == 1.0 assert results["individual_scores"][0] == 1.0 assert math.isnan(results["individual_scores"][1]) @@ -295,6 +296,8 @@ def chat_generator_run(self, *args, **kwargs): assert results["results"][1]["statement_scores"] == [] assert math.isnan(results["results"][1]["score"]) + assert "1 query(s) failed and were excluded from the score." in caplog.text + @pytest.mark.skipif( not os.environ.get("OPENAI_API_KEY", None), reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",