diff --git a/haystack/components/joiners/document_joiner.py b/haystack/components/joiners/document_joiner.py index 8a77b8d3d0..cca948571d 100644 --- a/haystack/components/joiners/document_joiner.py +++ b/haystack/components/joiners/document_joiner.py @@ -171,7 +171,7 @@ def _concatenate(document_lists: list[list[Document]]) -> list[Document]: for doc in itertools.chain.from_iterable(document_lists): docs_per_id[doc.id].append(doc) for docs in docs_per_id.values(): - doc_with_best_score = max(docs, key=lambda doc: doc.score if doc.score else -inf) + doc_with_best_score = max(docs, key=lambda doc: doc.score if doc.score is not None else -inf) output.append(doc_with_best_score) return output diff --git a/haystack/utils/requests_utils.py b/haystack/utils/requests_utils.py index c5270cc055..bbc04f14a8 100644 --- a/haystack/utils/requests_utils.py +++ b/haystack/utils/requests_utils.py @@ -67,9 +67,10 @@ def request_with_retry( after=after_log(logger, logging.DEBUG), ) def run() -> httpx.Response: - timeout = kwargs.pop("timeout", 10) + request_kwargs = dict(kwargs) + timeout = request_kwargs.pop("timeout", 10) with httpx.Client() as client: - res = client.request(**kwargs, timeout=timeout) + res = client.request(**request_kwargs, timeout=timeout) if res.status_code in status_codes_to_retry: # We raise only for the status codes that 
must trigger a retry @@ -177,9 +178,10 @@ async def example_5xx(): after=after_log(logger, logging.DEBUG), ) async def run() -> httpx.Response: - timeout = kwargs.pop("timeout", 10) + request_kwargs = dict(kwargs) + timeout = request_kwargs.pop("timeout", 10) async with httpx.AsyncClient() as client: - res = await client.request(**kwargs, timeout=timeout) + res = await client.request(**request_kwargs, timeout=timeout) if res.status_code in status_codes_to_retry: # We raise only for the status codes that must trigger a retry diff --git a/releasenotes/notes/Fix-DocumentJoiner-concatenate-mode-treating-score-0.0-as-missing-784af68479fb54bc.yaml b/releasenotes/notes/Fix-DocumentJoiner-concatenate-mode-treating-score-0.0-as-missing-784af68479fb54bc.yaml new file mode 100644 index 0000000000..e6bf19ced9 --- /dev/null +++ b/releasenotes/notes/Fix-DocumentJoiner-concatenate-mode-treating-score-0.0-as-missing-784af68479fb54bc.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Fixed `DocumentJoiner` in `concatenate` mode treating documents with a score of `0.0` as unscored when deduplicating by ID. + Duplicate documents with a zero score could lose to documents with negative or missing scores. diff --git a/releasenotes/notes/Fix-TransformersZeroShotDocumentClassifier-serialization-of-classification_field-and-multi_label-441a88b5195b8a8c.yaml b/releasenotes/notes/Fix-TransformersZeroShotDocumentClassifier-serialization-of-classification_field-and-multi_label-441a88b5195b8a8c.yaml new file mode 100644 index 0000000000..1e47c9c927 --- /dev/null +++ b/releasenotes/notes/Fix-TransformersZeroShotDocumentClassifier-serialization-of-classification_field-and-multi_label-441a88b5195b8a8c.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + `TransformersZeroShotDocumentClassifier.to_dict()` now serializes `classification_field` and `multi_label`, so pipeline dump/load round-trips preserve the configured classification behavior. 
diff --git a/releasenotes/notes/preserve-timeout-across-retries-a590b478d89d435b.yaml b/releasenotes/notes/preserve-timeout-across-retries-a590b478d89d435b.yaml new file mode 100644 index 0000000000..6e91fff9dc --- /dev/null +++ b/releasenotes/notes/preserve-timeout-across-retries-a590b478d89d435b.yaml @@ -0,0 +1,5 @@ +--- +fixes: + - | + Preserve user-specified timeouts across retries in `request_with_retry` and + `async_request_with_retry`. diff --git a/test/components/classifiers/test_zero_shot_document_classifier.py b/test/components/classifiers/test_zero_shot_document_classifier.py index 5c7a8fb831..6857143872 100644 --- a/test/components/classifiers/test_zero_shot_document_classifier.py +++ b/test/components/classifiers/test_zero_shot_document_classifier.py @@ -47,6 +47,17 @@ def test_to_dict(self): }, } + def test_to_dict_from_dict_round_trip(self): + component = TransformersZeroShotDocumentClassifier( + model="cross-encoder/nli-deberta-v3-xsmall", + labels=["a", "b"], + classification_field="title", + multi_label=True, + ) + restored = TransformersZeroShotDocumentClassifier.from_dict(component.to_dict()) + assert restored.classification_field == "title" + assert restored.multi_label is True + def test_from_dict(self, del_hf_env_vars): data = { "type": "haystack.components.classifiers.zero_shot_document_classifier.TransformersZeroShotDocumentClassifier", # noqa: E501 diff --git a/test/components/joiners/test_document_joiner.py b/test/components/joiners/test_document_joiner.py index 75abf2450d..ab62f03e2c 100644 --- a/test/components/joiners/test_document_joiner.py +++ b/test/components/joiners/test_document_joiner.py 
@@ -146,6 +146,17 @@ def test_run_with_concatenate_join_mode_and_duplicate_documents(self): output["documents"], key=lambda d: d.id ) + def test_concatenate_keeps_highest_score_for_zero_and_negative_scores(self): + joiner = DocumentJoiner(sort_by_score=False) + documents_1 = [Document(id="dup", content="no score")] + documents_2 = [Document(id="dup", content="zero score", score=0.0)] + documents_3 = [Document(id="dup", content="negative score", score=-0.1)] + + output = joiner.run([documents_1, documents_2, documents_3]) + assert len(output["documents"]) == 1 + assert output["documents"][0].content == "zero score" + assert output["documents"][0].score == 0.0 + def test_run_with_merge_join_mode(self): joiner = DocumentJoiner(join_mode="merge", weights=[1.5, 0.5]) documents_1 = [Document(content="a", score=1.0), Document(content="b", score=2.0)]