Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 11 additions & 3 deletions haystack/components/evaluators/context_relevance.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,18 @@
#
# SPDX-License-Identifier: Apache-2.0

import math
from statistics import mean
from typing import Any

from haystack import component, default_from_dict, default_to_dict
from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.components.generators.chat.types import ChatGenerator
from haystack.core.serialization import component_to_dict
from haystack.utils import deserialize_chatgenerator_inplace

logger = logging.getLogger(__name__)

# Private global variable for default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
{
Expand Down Expand Up @@ -182,8 +185,13 @@ def run(self, **inputs: Any) -> dict[str, Any]:
res["score"] = 0

# calculate the average context relevance score over the queries that succeeded (NaN scores of failed queries are excluded)
result["score"] = mean([res["score"] for res in result["results"]])
result["individual_scores"] = [res["score"] for res in result["results"]] # useful for the EvaluationRunResult
scores = [res["score"] for res in result["results"]]
valid_scores = [s for s in scores if not math.isnan(s)]
skipped = len(scores) - len(valid_scores)
if skipped:
logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped)
result["score"] = mean(valid_scores) if valid_scores else float("nan")
result["individual_scores"] = scores # useful for the EvaluationRunResult

return result

Expand Down
14 changes: 11 additions & 3 deletions haystack/components/evaluators/faithfulness.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,19 @@
#
# SPDX-License-Identifier: Apache-2.0

import math
from typing import Any

from numpy import mean as np_mean

from haystack import component, default_from_dict, default_to_dict
from haystack import component, default_from_dict, default_to_dict, logging
from haystack.components.evaluators.llm_evaluator import LLMEvaluator
from haystack.components.generators.chat.types import ChatGenerator
from haystack.core.serialization import component_to_dict
from haystack.utils import deserialize_chatgenerator_inplace

logger = logging.getLogger(__name__)

# Default examples to include in the prompt if the user does not provide any examples
_DEFAULT_EXAMPLES = [
{
Expand Down Expand Up @@ -176,8 +179,13 @@ def run(self, **inputs: Any) -> dict[str, Any]:
res["score"] = np_mean(res["statement_scores"])

# calculate the average answer faithfulness score over the queries that succeeded (NaN scores of failed queries are excluded)
result["score"] = np_mean([res["score"] for res in result["results"]])
result["individual_scores"] = [res["score"] for res in result["results"]]
scores = [res["score"] for res in result["results"]]
valid_scores = [s for s in scores if not math.isnan(s)]
skipped = len(scores) - len(valid_scores)
if skipped:
logger.warning("{skipped} query(s) failed and were excluded from the score.", skipped=skipped)
result["score"] = np_mean(valid_scores) if valid_scores else float("nan")
result["individual_scores"] = scores

return result

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
---
fixes:
- |
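Fixed ``FaithfulnessEvaluator`` and ``ContextRelevanceEvaluator`` to exclude failed queries (those with a ``nan`` score) from the aggregate mean score; previously a single failed query silently turned the aggregate score into ``nan``. A warning is now logged when one or more queries are skipped. The aggregate score is ``nan`` only if every query failed, and failed queries are still reported as ``nan`` in ``individual_scores``.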
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ def test_run_missing_parameters(self, monkeypatch):
with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
component.run()

def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = ContextRelevanceEvaluator(raise_on_failure=False)

Expand All @@ -231,13 +231,16 @@ def chat_generator_run(self, *args, **kwargs):
"programmers write clear, logical code for both small and large-scale software projects."
],
]
results = component.run(questions=questions, contexts=contexts)
with caplog.at_level("WARNING", logger="haystack.components.evaluators.context_relevance"):
results = component.run(questions=questions, contexts=contexts)

assert math.isnan(results["score"])
assert results["score"] == 1
assert results["results"][0] == {"relevant_statements": ["c", "d"], "score": 1}
assert results["results"][1]["relevant_statements"] == []
assert math.isnan(results["results"][1]["score"])

assert "1 query(s) failed and were excluded from the score." in caplog.text

@pytest.mark.skipif(
not os.environ.get("OPENAI_API_KEY", None),
reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
Expand Down
9 changes: 6 additions & 3 deletions test/components/evaluators/test_faithfulness_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ def test_run_missing_parameters(self, monkeypatch):
with pytest.raises(ValueError, match="LLM evaluator expected input parameter"):
component.run()

def test_run_returns_nan_raise_on_failure_false(self, monkeypatch):
def test_run_returns_nan_raise_on_failure_false(self, monkeypatch, caplog):
monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
component = FaithfulnessEvaluator(raise_on_failure=False)

Expand Down Expand Up @@ -282,9 +282,10 @@ def chat_generator_run(self, *args, **kwargs):
"Football is the most popular sport with around 4 billion followers worldwide.",
"Guido van Rossum.",
]
results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
with caplog.at_level("WARNING", logger="haystack.components.evaluators.faithfulness"):
results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

assert math.isnan(results["score"])
assert results["score"] == 1.0

assert results["individual_scores"][0] == 1.0
assert math.isnan(results["individual_scores"][1])
Expand All @@ -295,6 +296,8 @@ def chat_generator_run(self, *args, **kwargs):
assert results["results"][1]["statement_scores"] == []
assert math.isnan(results["results"][1]["score"])

assert "1 query(s) failed and were excluded from the score." in caplog.text

@pytest.mark.skipif(
not os.environ.get("OPENAI_API_KEY", None),
reason="Export an env var called OPENAI_API_KEY containing the OpenAI API key to run this test.",
Expand Down
Loading