diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 690e41120c..61357ebd0a 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -60,8 +60,6 @@ jobs: - "haystack/components/audio/whisper_local.py" - "haystack/components/classifiers/zero_shot_document_classifier.py" - "haystack/components/converters/tika.py" - - "haystack/components/embedders/hugging_face_api_document_embedder.py" - - "haystack/components/embedders/hugging_face_api_text_embedder.py" - "haystack/components/embedders/backends/sentence_transformers_backend.py" - "haystack/components/embedders/backends/sentence_transformers_sparse_backend.py" - "haystack/components/embedders/image/sentence_transformers_doc_image_embedder.py" @@ -69,10 +67,7 @@ jobs: - "haystack/components/embedders/sentence_transformers_sparse_document_embedder.py" - "haystack/components/embedders/sentence_transformers_sparse_text_embedder.py" - "haystack/components/evaluators/sas_evaluator.py" - - "haystack/components/generators/chat/hugging_face_api.py" - "haystack/components/generators/chat/hugging_face_local.py" - - "haystack/components/generators/hugging_face_api.py" - - "haystack/components/generators/hugging_face_local_generator.py" - "haystack/components/generators/openai_dalle.py" - "haystack/components/preprocessors/embedding_based_document_splitter.py" - "haystack/components/rankers/sentence_transformers_diversity.py" @@ -86,17 +81,12 @@ jobs: - "test/components/audio/test_whisper_local.py" - "test/components/classifiers/test_zero_shot_document_classifier.py" - "test/components/converters/test_tika_doc_converter.py" - - "test/components/embedders/test_hugging_face_api_document_embedder.py" - - "test/components/embedders/test_hugging_face_api_text_embedder.py" - "test/components/embedders/image/test_sentence_transformers_doc_image_embedder.py" - "test/components/embedders/test_sentence_transformers_text_embedder.py" - 
"test/components/embedders/test_sentence_transformers_sparse_document_embedder.py" - "test/components/embedders/test_sentence_transformers_sparse_text_embedder.py" - "test/components/evaluators/test_sas_evaluator.py" - - "test/components/generators/chat/test_hugging_face_api.py" - "test/components/generators/chat/test_hugging_face_local.py" - - "test/components/generators/test_hugging_face_api.py" - - "test/components/generators/test_hugging_face_local_generator.py" - "test/components/generators/test_openai_dalle.py" - "test/components/preprocessors/test_embedding_based_document_splitter.py" - "test/components/rankers/test_sentence_transformers_diversity.py" diff --git a/MIGRATION.md b/MIGRATION.md index 697fda4614..21722c72e6 100644 --- a/MIGRATION.md +++ b/MIGRATION.md @@ -40,6 +40,8 @@ component = NewComponent(new_param="value") - **One entry per breaking change.** Don't bundle unrelated changes into a single entry. - **Include a working code example** for every rename, removal, or signature change. - **Link to the PR** when extra context would help (e.g. `See [#1234](https://github.com/deepset-ai/haystack/pull/1234)`). +- **Components moved to external packages** don't need a full entry: add a row to the table in + [Components Moved to External Packages](#components-moved-to-external-packages) instead. --- @@ -71,6 +73,27 @@ from haystack.dataclasses import Document doc = Document(content="col\n1\n2\n3") ``` +### Components Moved to External Packages + +**What changed:** Some components have been moved out of Haystack into dedicated integration packages, +hosted in the [haystack-core-integrations](https://github.com/deepset-ai/haystack-core-integrations) repository. + +**Why:** Moving these components to separate packages allows testing more thoroughly in isolation and +releasing fixes independently of the Haystack release cycle. This also makes Haystack development and CI leaner. 
+ +**How to migrate:** Install the new package and update your imports as shown in the table below. + +```bash +pip install <new-package> +``` + +| Old import (`haystack-ai<3.0.0`) | New package | New import | +|---|---|---| +| `from haystack.components.generators.chat import HuggingFaceAPIChatGenerator` | `huggingface-api-haystack` | `from haystack_integrations.components.generators.huggingface_api import HuggingFaceAPIChatGenerator` | +| `from haystack.components.embedders import HuggingFaceAPITextEmbedder` | `huggingface-api-haystack` | `from haystack_integrations.components.embedders.huggingface_api import HuggingFaceAPITextEmbedder` | +| `from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder` | `huggingface-api-haystack` | `from haystack_integrations.components.embedders.huggingface_api import HuggingFaceAPIDocumentEmbedder` | +| `from haystack.components.rankers import HuggingFaceTEIRanker` | `huggingface-api-haystack` | `from haystack_integrations.components.rankers.huggingface_api import HuggingFaceTEIRanker` | + ### ToolInvoker component removed **What changed:** The `ToolInvoker` component has been removed. 
Imports from `haystack.components.tools` diff --git a/haystack/components/embedders/__init__.py b/haystack/components/embedders/__init__.py index a6c92ca3b3..d15d5cf761 100644 --- a/haystack/components/embedders/__init__.py +++ b/haystack/components/embedders/__init__.py @@ -10,8 +10,6 @@ _import_structure = { "azure_document_embedder": ["AzureOpenAIDocumentEmbedder"], "azure_text_embedder": ["AzureOpenAITextEmbedder"], - "hugging_face_api_document_embedder": ["HuggingFaceAPIDocumentEmbedder"], - "hugging_face_api_text_embedder": ["HuggingFaceAPITextEmbedder"], "openai_document_embedder": ["OpenAIDocumentEmbedder"], "openai_text_embedder": ["OpenAITextEmbedder"], "sentence_transformers_document_embedder": ["SentenceTransformersDocumentEmbedder"], @@ -23,8 +21,6 @@ if TYPE_CHECKING: from .azure_document_embedder import AzureOpenAIDocumentEmbedder as AzureOpenAIDocumentEmbedder from .azure_text_embedder import AzureOpenAITextEmbedder as AzureOpenAITextEmbedder - from .hugging_face_api_document_embedder import HuggingFaceAPIDocumentEmbedder as HuggingFaceAPIDocumentEmbedder - from .hugging_face_api_text_embedder import HuggingFaceAPITextEmbedder as HuggingFaceAPITextEmbedder from .openai_document_embedder import OpenAIDocumentEmbedder as OpenAIDocumentEmbedder from .openai_text_embedder import OpenAITextEmbedder as OpenAITextEmbedder from .sentence_transformers_document_embedder import ( diff --git a/haystack/components/embedders/hugging_face_api_document_embedder.py b/haystack/components/embedders/hugging_face_api_document_embedder.py deleted file mode 100644 index 0b2951f8ed..0000000000 --- a/haystack/components/embedders/hugging_face_api_document_embedder.py +++ /dev/null @@ -1,378 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -from asyncio import Semaphore, gather -from dataclasses import replace -from itertools import chain -from typing import Any - -from tqdm import tqdm - -from haystack import 
component, default_from_dict, default_to_dict, logging -from haystack.dataclasses import Document -from haystack.lazy_imports import LazyImport -from haystack.utils import Secret -from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model -from haystack.utils.url_validation import is_valid_http_url - -with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import: - from huggingface_hub import AsyncInferenceClient, InferenceClient - -logger = logging.getLogger(__name__) - - -@component -class HuggingFaceAPIDocumentEmbedder: - """ - Embeds documents using Hugging Face APIs. - - Use it with the following Hugging Face APIs: - - [Free Serverless Inference API](https://huggingface.co/inference-api) - - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) - - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) - - - ### Usage examples - - #### With free serverless inference API - - ```python - from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder - from haystack.utils import Secret - from haystack.dataclasses import Document - - doc = Document(content="I love pizza!") - - doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="serverless_inference_api", - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("")) - - result = document_embedder.run([doc]) - print(result["documents"][0].embedding) - - # [0.017020374536514282, -0.023255806416273117, ...] 
- ``` - - #### With paid inference endpoints - - ```python - from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder - from haystack.utils import Secret - from haystack.dataclasses import Document - - doc = Document(content="I love pizza!") - - doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="inference_endpoints", - api_params={"url": ""}, - token=Secret.from_token("")) - - result = document_embedder.run([doc]) - print(result["documents"][0].embedding) - - # [0.017020374536514282, -0.023255806416273117, ...] - ``` - - #### With self-hosted text embeddings inference - - ```python - from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder - from haystack.dataclasses import Document - - doc = Document(content="I love pizza!") - - doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="text_embeddings_inference", - api_params={"url": "http://localhost:8080"}) - - result = document_embedder.run([doc]) - print(result["documents"][0].embedding) - - # [0.017020374536514282, -0.023255806416273117, ...] - ``` - """ - - def __init__( - self, - api_type: HFEmbeddingAPIType | str, - api_params: dict[str, str], - token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), - prefix: str = "", - suffix: str = "", - truncate: bool | None = True, - normalize: bool | None = False, - batch_size: int = 32, - progress_bar: bool = True, - meta_fields_to_embed: list[str] | None = None, - embedding_separator: str = "\n", - concurrency_limit: int = 4, - ) -> None: - """ - Creates a HuggingFaceAPIDocumentEmbedder component. - - :param api_type: - The type of Hugging Face API to use. - :param api_params: - A dictionary with the following keys: - - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`. - - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or - `TEXT_EMBEDDINGS_INFERENCE`. 
- :param token: The Hugging Face token to use as HTTP bearer authorization. - Check your HF token in your [account settings](https://huggingface.co/settings/tokens). - :param prefix: - A string to add at the beginning of each text. - :param suffix: - A string to add at the end of each text. - :param truncate: - Truncates the input text to the maximum length supported by the model. - Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS` - if the backend uses Text Embeddings Inference. - If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored. - :param normalize: - Normalizes the embeddings to unit length. - Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS` - if the backend uses Text Embeddings Inference. - If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored. - :param batch_size: - Number of documents to process at once. - :param progress_bar: - If `True`, shows a progress bar when running. - :param meta_fields_to_embed: - List of metadata fields to embed along with the document text. - :param embedding_separator: - Separator used to concatenate the metadata fields to the document text. - :param concurrency_limit: - The maximum number of requests that should be allowed to run concurrently. - This parameter is only used in the `run_async` method. - """ - huggingface_hub_import.check() - - if isinstance(api_type, str): - api_type = HFEmbeddingAPIType.from_str(api_type) - - api_params = api_params or {} - - if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API: - model = api_params.get("model") - if model is None: - raise ValueError( - "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`." 
- ) - check_valid_model(model, HFModelType.EMBEDDING, token) - model_or_url = model - elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]: - url = api_params.get("url") - if url is None: - msg = ( - "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` " - "parameter in `api_params`." - ) - raise ValueError(msg) - if not is_valid_http_url(url): - raise ValueError(f"Invalid URL: {url}") - model_or_url = url - else: - msg = f"Unknown api_type {api_type}" - raise ValueError(msg) - - client_args: dict[str, Any] = {"model": model_or_url, "token": token.resolve_value() if token else None} - - self.api_type = api_type - self.api_params = api_params - self.token = token - self.prefix = prefix - self.suffix = suffix - self.truncate = truncate - self.normalize = normalize - self.batch_size = batch_size - self.progress_bar = progress_bar - self.meta_fields_to_embed = meta_fields_to_embed or [] - self.embedding_separator = embedding_separator - self.concurrency_limit = concurrency_limit - self._client = InferenceClient(**client_args) - self._async_client = AsyncInferenceClient(**client_args) - - def to_dict(self) -> dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: - Dictionary with serialized data. - """ - return default_to_dict( - self, - api_type=str(self.api_type), - api_params=self.api_params, - prefix=self.prefix, - suffix=self.suffix, - token=self.token, - truncate=self.truncate, - normalize=self.normalize, - batch_size=self.batch_size, - progress_bar=self.progress_bar, - meta_fields_to_embed=self.meta_fields_to_embed, - embedding_separator=self.embedding_separator, - concurrency_limit=self.concurrency_limit, - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPIDocumentEmbedder": - """ - Deserializes the component from a dictionary. - - :param data: - Dictionary to deserialize from. - :returns: - Deserialized component. 
- """ - return default_from_dict(cls, data) - - def _prepare_texts_to_embed(self, documents: list[Document]) -> list[str]: - """ - Prepare the texts to embed by concatenating the Document text with the metadata fields to embed. - """ - texts_to_embed = [] - for doc in documents: - meta_values_to_embed = [ - str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None - ] - - text_to_embed = ( - self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix - ) - - texts_to_embed.append(text_to_embed) - return texts_to_embed - - @staticmethod - def _adjust_api_parameters( - truncate: bool | None, normalize: bool | None, api_type: HFEmbeddingAPIType - ) -> tuple[bool | None, bool | None]: - """ - Adjust the truncate and normalize parameters based on the API type. - """ - if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API: - if truncate is not None: - msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored." - logger.warning(msg) - truncate = None - if normalize is not None: - msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored." - logger.warning(msg) - normalize = None - return truncate, normalize - - def _embed_batch(self, texts_to_embed: list[str], batch_size: int) -> list[list[float]]: - """ - Embed a list of texts in batches. 
- """ - truncate, normalize = self._adjust_api_parameters(self.truncate, self.normalize, self.api_type) - - all_embeddings: list = [] - for i in tqdm( - range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings" - ): - batch = texts_to_embed[i : i + batch_size] - - np_embeddings = self._client.feature_extraction(text=batch, truncate=truncate, normalize=normalize) - - if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch): - raise ValueError(f"Expected embedding shape ({batch_size}, embedding_dim), got {np_embeddings.shape}") - - all_embeddings.extend(np_embeddings.tolist()) - - return all_embeddings - - async def _embed_batch_async(self, texts_to_embed: list[str], batch_size: int) -> list[list[float]]: - """ - Embed a list of texts in batches asynchronously. - """ - truncate, normalize = self._adjust_api_parameters(self.truncate, self.normalize, self.api_type) - sem = Semaphore(max(1, self.concurrency_limit)) - num_batches = (len(texts_to_embed) + batch_size - 1) // batch_size - pbar = tqdm(total=num_batches, disable=not self.progress_bar, desc="Calculating embeddings") - - async def _runner(batch: list[str]) -> list[list[float]]: - async with sem: - np_embeddings = await self._async_client.feature_extraction( - text=batch, truncate=truncate, normalize=normalize - ) - - if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch): - raise ValueError( - f"Expected embedding shape ({batch_size}, embedding_dim), got {np_embeddings.shape}" - ) - - pbar.update(1) - return np_embeddings.tolist() - - try: - all_embeddings = [ - *chain( - *await gather( - *[ - _runner(texts_to_embed[i : i + batch_size]) - for i in range(0, len(texts_to_embed), batch_size) - ] - ) - ) - ] - finally: - pbar.close() - - return all_embeddings - - @component.output_types(documents=list[Document]) - def run(self, documents: list[Document]) -> dict[str, list[Document]]: - """ - Embeds a list of documents. 
- - :param documents: - Documents to embed. - - :returns: - A dictionary with the following keys: - - `documents`: A list of documents with embeddings. - """ - if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): - raise TypeError( - "HuggingFaceAPIDocumentEmbedder expects a list of Documents as input." - " In case you want to embed a string, please use the HuggingFaceAPITextEmbedder." - ) - - texts_to_embed = self._prepare_texts_to_embed(documents=documents) - - embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size) - - new_documents = [] - for doc, emb in zip(documents, embeddings, strict=True): - new_documents.append(replace(doc, embedding=emb)) - - return {"documents": new_documents} - - @component.output_types(documents=list[Document]) - async def run_async(self, documents: list[Document]) -> dict[str, list[Document]]: - """ - Embeds a list of documents asynchronously. - - :param documents: - Documents to embed. - - :returns: - A dictionary with the following keys: - - `documents`: A list of documents with embeddings. - """ - if not isinstance(documents, list) or documents and not isinstance(documents[0], Document): - raise TypeError( - "HuggingFaceAPIDocumentEmbedder expects a list of Documents as input." - " In case you want to embed a string, please use the HuggingFaceAPITextEmbedder." 
- ) - - texts_to_embed = self._prepare_texts_to_embed(documents=documents) - - embeddings = await self._embed_batch_async(texts_to_embed=texts_to_embed, batch_size=self.batch_size) - - new_documents = [] - for doc, emb in zip(documents, embeddings, strict=True): - new_documents.append(replace(doc, embedding=emb)) - - return {"documents": new_documents} diff --git a/haystack/components/embedders/hugging_face_api_text_embedder.py b/haystack/components/embedders/hugging_face_api_text_embedder.py deleted file mode 100644 index 5eb45a8c73..0000000000 --- a/haystack/components/embedders/hugging_face_api_text_embedder.py +++ /dev/null @@ -1,258 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -from typing import Any - -from haystack import component, default_from_dict, default_to_dict, logging -from haystack.lazy_imports import LazyImport -from haystack.utils import Secret -from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model -from haystack.utils.url_validation import is_valid_http_url - -with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import: - from huggingface_hub import AsyncInferenceClient, InferenceClient - -logger = logging.getLogger(__name__) - - -@component -class HuggingFaceAPITextEmbedder: - """ - Embeds strings using Hugging Face APIs. 
- - Use it with the following Hugging Face APIs: - - [Free Serverless Inference API](https://huggingface.co/inference-api) - - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) - - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) - - ### Usage examples - - #### With free serverless inference API - - ```python - from haystack.components.embedders import HuggingFaceAPITextEmbedder - from haystack.utils import Secret - - text_embedder = HuggingFaceAPITextEmbedder(api_type="serverless_inference_api", - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("")) - - print(text_embedder.run("I love pizza!")) - - # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], - ``` - - #### With paid inference endpoints - - ```python - from haystack.components.embedders import HuggingFaceAPITextEmbedder - from haystack.utils import Secret - text_embedder = HuggingFaceAPITextEmbedder(api_type="inference_endpoints", - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("")) - - print(text_embedder.run("I love pizza!")) - - # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], - ``` - - #### With self-hosted text embeddings inference - - ```python - from haystack.components.embedders import HuggingFaceAPITextEmbedder - from haystack.utils import Secret - - text_embedder = HuggingFaceAPITextEmbedder(api_type="text_embeddings_inference", - api_params={"url": "http://localhost:8080"}) - - print(text_embedder.run("I love pizza!")) - - # {'embedding': [0.017020374536514282, -0.023255806416273117, ...], - ``` - """ - - def __init__( - self, - api_type: HFEmbeddingAPIType | str, - api_params: dict[str, str], - token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), - prefix: str = "", - suffix: str = "", - truncate: bool | None = True, - normalize: bool | None = False, - ) -> None: - """ - Creates a 
HuggingFaceAPITextEmbedder component. - - :param api_type: - The type of Hugging Face API to use. - :param api_params: - A dictionary with the following keys: - - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`. - - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or - `TEXT_EMBEDDINGS_INFERENCE`. - :param token: The Hugging Face token to use as HTTP bearer authorization. - Check your HF token in your [account settings](https://huggingface.co/settings/tokens). - :param prefix: - A string to add at the beginning of each text. - :param suffix: - A string to add at the end of each text. - :param truncate: - Truncates the input text to the maximum length supported by the model. - Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS` - if the backend uses Text Embeddings Inference. - If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored. - :param normalize: - Normalizes the embeddings to unit length. - Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS` - if the backend uses Text Embeddings Inference. - If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored. - """ - huggingface_hub_import.check() - - if isinstance(api_type, str): - api_type = HFEmbeddingAPIType.from_str(api_type) - - if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API: - model = api_params.get("model") - if model is None: - raise ValueError( - "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`." - ) - check_valid_model(model, HFModelType.EMBEDDING, token) - model_or_url = model - elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]: - url = api_params.get("url") - if url is None: - msg = ( - "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` " - "parameter in `api_params`." 
- ) - raise ValueError(msg) - if not is_valid_http_url(url): - raise ValueError(f"Invalid URL: {url}") - model_or_url = url - else: - msg = f"Unknown api_type {api_type}" - raise ValueError(msg) - - self.api_type = api_type - self.api_params = api_params - self.token = token - self.prefix = prefix - self.suffix = suffix - self.truncate = truncate - self.normalize = normalize - self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None) - self._async_client = AsyncInferenceClient(model_or_url, token=token.resolve_value() if token else None) - - def _prepare_input(self, text: str) -> tuple[str, bool | None, bool | None]: - if not isinstance(text, str): - raise TypeError( - "HuggingFaceAPITextEmbedder expects a string as an input." - "In case you want to embed a list of Documents, please use the HuggingFaceAPIDocumentEmbedder." - ) - - truncate = self.truncate - normalize = self.normalize - - if self.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API: - if truncate is not None: - msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored." - logger.warning(msg) - truncate = None - if normalize is not None: - msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored." - logger.warning(msg) - normalize = None - - text_to_embed = self.prefix + text + self.suffix - - return text_to_embed, truncate, normalize - - def to_dict(self) -> dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: - Dictionary with serialized data. - """ - return default_to_dict( - self, - api_type=str(self.api_type), - api_params=self.api_params, - prefix=self.prefix, - suffix=self.suffix, - token=self.token, - truncate=self.truncate, - normalize=self.normalize, - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPITextEmbedder": - """ - Deserializes the component from a dictionary. - - :param data: - Dictionary to deserialize from. 
- :returns: - Deserialized component. - """ - return default_from_dict(cls, data) - - @component.output_types(embedding=list[float]) - def run(self, text: str) -> dict[str, Any]: - """ - Embeds a single string. - - :param text: - Text to embed. - - :returns: - A dictionary with the following keys: - - `embedding`: The embedding of the input text. - """ - text_to_embed, truncate_val, normalize_val = self._prepare_input(text) - - np_embedding = self._client.feature_extraction( - text=text_to_embed, truncate=truncate_val, normalize=normalize_val - ) - - error_msg = f"Expected embedding shape (1, embedding_dim) or (embedding_dim,), got {np_embedding.shape}" - if np_embedding.ndim > 2: - raise ValueError(error_msg) - if np_embedding.ndim == 2 and np_embedding.shape[0] != 1: - raise ValueError(error_msg) - - embedding = np_embedding.flatten().tolist() - - return {"embedding": embedding} - - @component.output_types(embedding=list[float]) - async def run_async(self, text: str) -> dict[str, Any]: - """ - Embeds a single string asynchronously. - - :param text: - Text to embed. - - :returns: - A dictionary with the following keys: - - `embedding`: The embedding of the input text. 
- """ - text_to_embed, truncate_val, normalize_val = self._prepare_input(text) - - np_embedding = await self._async_client.feature_extraction( - text=text_to_embed, truncate=truncate_val, normalize=normalize_val - ) - - error_msg = f"Expected embedding shape (1, embedding_dim) or (embedding_dim,), got {np_embedding.shape}" - if np_embedding.ndim > 2: - raise ValueError(error_msg) - if np_embedding.ndim == 2 and np_embedding.shape[0] != 1: - raise ValueError(error_msg) - - embedding = np_embedding.flatten().tolist() - - return {"embedding": embedding} diff --git a/haystack/components/generators/chat/__init__.py b/haystack/components/generators/chat/__init__.py index 66a2bfa229..fbfe3af243 100644 --- a/haystack/components/generators/chat/__init__.py +++ b/haystack/components/generators/chat/__init__.py @@ -13,7 +13,6 @@ "azure": ["AzureOpenAIChatGenerator"], "azure_responses": ["AzureOpenAIResponsesChatGenerator"], "hugging_face_local": ["HuggingFaceLocalChatGenerator"], - "hugging_face_api": ["HuggingFaceAPIChatGenerator"], "fallback": ["FallbackChatGenerator"], "llm": ["LLM"], } diff --git a/haystack/components/generators/chat/hugging_face_api.py b/haystack/components/generators/chat/hugging_face_api.py deleted file mode 100644 index 83ed222c4d..0000000000 --- a/haystack/components/generators/chat/hugging_face_api.py +++ /dev/null @@ -1,732 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -import json -from collections.abc import AsyncIterable, Iterable -from datetime import datetime -from typing import Any, Union - -from haystack import component, default_from_dict, default_to_dict, logging -from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message, _normalize_messages -from haystack.dataclasses import ( - ChatMessage, - ComponentInfo, - ReasoningContent, - StreamingCallbackT, - StreamingChunk, - SyncStreamingCallbackT, - ToolCall, - select_streaming_callback, -) -from 
haystack.dataclasses.streaming_chunk import FinishReason, _invoke_streaming_callback -from haystack.lazy_imports import LazyImport -from haystack.tools import ( - ToolsType, - _check_duplicate_tool_names, - deserialize_tools_or_toolset_inplace, - flatten_tools_or_toolsets, - serialize_tools_or_toolset, - warm_up_tools, -) -from haystack.utils import Secret, deserialize_callable, serialize_callable -from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format -from haystack.utils.url_validation import is_valid_http_url - -logger = logging.getLogger(__name__) - -with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import: - from huggingface_hub import ( - AsyncInferenceClient, - ChatCompletionInputFunctionDefinition, - ChatCompletionInputStreamOptions, - ChatCompletionInputTool, - ChatCompletionOutput, - ChatCompletionOutputComplete, - ChatCompletionOutputToolCall, - ChatCompletionStreamOutput, - ChatCompletionStreamOutputChoice, - InferenceClient, - ) - - -def _convert_hfapi_tool_calls(hfapi_tool_calls: list["ChatCompletionOutputToolCall"] | None) -> list[ToolCall]: - """ - Convert HuggingFace API tool calls to a list of Haystack ToolCall. - - :param hfapi_tool_calls: The HuggingFace API tool calls to convert. - :returns: A list of ToolCall objects. - - """ - if not hfapi_tool_calls: - return [] - - tool_calls = [] - - for hfapi_tc in hfapi_tool_calls: - hf_arguments = hfapi_tc.function.arguments - - arguments = None - if isinstance(hf_arguments, dict): - arguments = hf_arguments - elif isinstance(hf_arguments, str): - try: - arguments = json.loads(hf_arguments) - except json.JSONDecodeError: - logger.warning( - "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call " - "will be skipped. 
Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}", - _id=hfapi_tc.id, - _name=hfapi_tc.function.name, - _arguments=hf_arguments, - ) - else: - logger.warning( - "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool " - "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}", - _id=hfapi_tc.id, - _name=hfapi_tc.function.name, - _arguments=hf_arguments, - ) - - if arguments: - tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id)) - - return tool_calls - - -def _extract_reasoning_content(message_or_delta: Any) -> ReasoningContent | None: - """ - Extract reasoning content from a HuggingFace API message or delta object. - - :param message_or_delta: The HuggingFace message or delta object that may contain reasoning. - :returns: ReasoningContent if reasoning is present, None otherwise. - """ - if hasattr(message_or_delta, "reasoning") and message_or_delta.reasoning: - return ReasoningContent(reasoning_text=message_or_delta.reasoning) - return None - - -def _resolve_schema_refs(schema: dict[str, Any]) -> dict[str, Any]: - """ - Resolve ``$ref`` references in a JSON schema by inlining ``$defs`` definitions. - - The HuggingFace API does not support ``$defs`` and ``$ref`` in tool parameter schemas. - This function expands all ``$ref`` pointers and removes the ``$defs`` section. - - :param schema: A JSON schema dict potentially containing ``$defs`` and ``$ref``. - :returns: A new schema dict with all references resolved and ``$defs`` removed. 
- """ - defs = schema.get("$defs", {}) - if not defs: - return schema - - def _resolve(obj: Any, resolving: set[str] | None = None) -> Any: - if resolving is None: - resolving = set() - if isinstance(obj, dict): - if "$ref" in obj: - ref_path = obj["$ref"] - parts = ref_path.split("/") - if len(parts) == 3 and parts[0] == "#" and parts[1] == "$defs": - def_name = parts[2] - if def_name in defs and def_name not in resolving: - return _resolve(defs[def_name], resolving | {def_name}) - return {k: _resolve(v, resolving) for k, v in obj.items() if k != "$defs"} - if isinstance(obj, list): - return [_resolve(item, resolving) for item in obj] - return obj - - return _resolve(schema) - - -def _convert_tools_to_hfapi_tools(tools: ToolsType | None) -> list["ChatCompletionInputTool"] | None: - if not tools: - return None - - # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters" - parameters_name = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters" - - hf_tools = [] - for tool in flatten_tools_or_toolsets(tools): - hf_tools_args = { - "name": tool.name, - "description": tool.description, - parameters_name: _resolve_schema_refs(tool.parameters), - } - - hf_tools.append( - ChatCompletionInputTool( - function=ChatCompletionInputFunctionDefinition(**hf_tools_args), # type: ignore[arg-type] - type="function", - ) - ) - - return hf_tools - - -def _map_hf_finish_reason_to_haystack( - choice: Union["ChatCompletionStreamOutputChoice", "ChatCompletionOutputComplete"], -) -> FinishReason | None: - """ - Map HuggingFace finish reasons to Haystack FinishReason literals. - - Uses the full choice object to detect tool calls and provide accurate mapping. 
- - HuggingFace finish reasons (can be found here https://huggingface.github.io/text-generation-inference/ under - FinishReason): - - "length": number of generated tokens == `max_new_tokens` - - "eos_token": the model generated its end of sequence token - - "stop_sequence": the model generated a text included in `stop_sequences` - - Additionally, detects tool calls from delta.tool_calls or delta.tool_call_id. - - :param choice: The HuggingFace ChatCompletionStreamOutputChoice object. - :returns: The corresponding Haystack FinishReason or None. - """ - if choice.finish_reason is None: - return None - - # Check if this choice contains tool call information - if isinstance(choice, ChatCompletionStreamOutputChoice): - has_tool_calls = choice.delta.tool_calls is not None or choice.delta.tool_call_id is not None - else: - has_tool_calls = choice.message.tool_calls is not None or choice.message.tool_call_id is not None - - # If we detect tool calls, override the finish reason - if has_tool_calls: - return "tool_calls" - - # Map HuggingFace finish reasons to Haystack standard ones - mapping: dict[str, FinishReason] = { - "length": "length", # Direct match - "eos_token": "stop", # EOS token means natural stop - "stop_sequence": "stop", # Stop sequence means natural stop - } - - return mapping.get(choice.finish_reason, "stop") # Default to "stop" for unknown reasons - - -def _convert_chat_completion_stream_output_to_streaming_chunk( - chunk: "ChatCompletionStreamOutput", - previous_chunks: list[StreamingChunk], - component_info: ComponentInfo | None = None, -) -> StreamingChunk: - """ - Converts the Hugging Face API ChatCompletionStreamOutput to a StreamingChunk. - """ - # Choices is empty if include_usage is set to True where the usage information is returned. 
- if len(chunk.choices) == 0: - usage = None - if chunk.usage: - usage = {"prompt_tokens": chunk.usage.prompt_tokens, "completion_tokens": chunk.usage.completion_tokens} - return StreamingChunk( - content="", - meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "usage": usage}, - component_info=component_info, - ) - - # n is unused, so the API always returns only one choice - # the argument is probably allowed for compatibility with OpenAI - # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n - choice = chunk.choices[0] - mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None - - # Extract reasoning content if present - reasoning = _extract_reasoning_content(choice.delta) - - return StreamingChunk( - content=choice.delta.content or "", - meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "finish_reason": choice.finish_reason}, - component_info=component_info, - # Index must always be 0 since we don't allow tool calls in streaming mode. - index=0 if choice.finish_reason is None else None, - # start is True at the very beginning since first chunk contains role information + first part of the answer. - start=len(previous_chunks) == 0, - finish_reason=mapped_finish_reason, - reasoning=reasoning, - ) - - -@component -class HuggingFaceAPIChatGenerator: - """ - Completes chats using Hugging Face APIs. - - HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage) - format for input and output. 
Use it to generate text with Hugging Face APIs: - - [Serverless Inference API (Inference Providers)](https://huggingface.co/docs/inference-providers) - - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints) - - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference) - - ### Usage examples - - #### With the serverless inference API (Inference Providers) - free tier available - - ```python - from haystack.components.generators.chat import HuggingFaceAPIChatGenerator - from haystack.dataclasses import ChatMessage - from haystack.utils import Secret - from haystack.utils.hf import HFGenerationAPIType - - messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"), - ChatMessage.from_user("What's Natural Language Processing?")] - - # the api_type can be expressed using the HFGenerationAPIType enum or as a string - api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API - api_type = "serverless_inference_api" # this is equivalent to the above - - generator = HuggingFaceAPIChatGenerator(api_type=api_type, - api_params={"model": "Qwen/Qwen2.5-7B-Instruct", - "provider": "together"}, - token=Secret.from_token("")) - - result = generator.run(messages) - print(result) - ``` - - #### With the serverless inference API (Inference Providers) and text+image input - - ```python - from haystack.components.generators.chat import HuggingFaceAPIChatGenerator - from haystack.dataclasses import ChatMessage, ImageContent - from haystack.utils import Secret - from haystack.utils.hf import HFGenerationAPIType - - # Create an image from file path, URL, or base64 - image = ImageContent.from_file_path("path/to/your/image.jpg") - - # Create a multimodal message with both text and image - messages = [ChatMessage.from_user(content_parts=["Describe this image in detail", image])] - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={ - "model": 
"Qwen/Qwen2.5-VL-7B-Instruct", # Vision Language Model - "provider": "hyperbolic" - }, - token=Secret.from_token("") - ) - - result = generator.run(messages) - print(result) - ``` - - #### With paid inference endpoints - - ```python - from haystack.components.generators.chat import HuggingFaceAPIChatGenerator - from haystack.dataclasses import ChatMessage - from haystack.utils import Secret - - messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"), - ChatMessage.from_user("What's Natural Language Processing?")] - - generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints", - api_params={"url": ""}, - token=Secret.from_token("")) - - result = generator.run(messages) - print(result) - ``` - - #### With self-hosted text generation inference - - ```python - from haystack.components.generators.chat import HuggingFaceAPIChatGenerator - from haystack.dataclasses import ChatMessage - - messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"), - ChatMessage.from_user("What's Natural Language Processing?")] - - generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference", - api_params={"url": "http://localhost:8080"}) - - result = generator.run(messages) - print(result) - ``` - """ - - def __init__( - self, - api_type: HFGenerationAPIType | str, - api_params: dict[str, str], - token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), - generation_kwargs: dict[str, Any] | None = None, - stop_words: list[str] | None = None, - streaming_callback: StreamingCallbackT | None = None, - tools: ToolsType | None = None, - ) -> None: - """ - Initialize the HuggingFaceAPIChatGenerator instance. - - :param api_type: - The type of Hugging Face API to use. Available types: - - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference). 
- - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints). - - `serverless_inference_api`: See - [Serverless Inference API - Inference Providers](https://huggingface.co/docs/inference-providers). - :param api_params: - A dictionary with the following keys: - - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`. - - `provider`: Provider name. Recommended when `api_type` is `SERVERLESS_INFERENCE_API`. - - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or - `TEXT_GENERATION_INFERENCE`. - - Other parameters specific to the chosen API type, such as `timeout`, `headers`, etc. - :param token: - The Hugging Face token to use as HTTP bearer authorization. - Check your HF token in your [account settings](https://huggingface.co/settings/tokens). - :param generation_kwargs: - A dictionary with keyword arguments to customize text generation. - Some examples: `max_tokens`, `temperature`, `top_p`. - For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion). - :param stop_words: - An optional list of strings representing the stop words. - :param streaming_callback: - An optional callable for handling streaming responses. - :param tools: - A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls. - The chosen model should support tool/function calling, according to the model card. - Support for tools in the Hugging Face API and TGI is not yet fully refined and you may experience - unexpected behavior. 
- """ - - huggingface_hub_import.check() - - if isinstance(api_type, str): - api_type = HFGenerationAPIType.from_str(api_type) - - if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API: - model = api_params.get("model") - if model is None: - raise ValueError( - "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`." - ) - check_valid_model(model, HFModelType.GENERATION, token) - model_or_url = model - elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]: - url = api_params.get("url") - if url is None: - msg = ( - "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter " - "in `api_params`." - ) - raise ValueError(msg) - if not is_valid_http_url(url): - raise ValueError(f"Invalid URL: {url}") - model_or_url = url - else: - msg = f"Unknown api_type {api_type}" - raise ValueError(msg) - - if tools and streaming_callback is not None: - raise ValueError("Using tools and streaming at the same time is not supported. 
Please choose one.") - _check_duplicate_tool_names(flatten_tools_or_toolsets(tools)) - - # handle generation kwargs setup - generation_kwargs = generation_kwargs.copy() if generation_kwargs else {} - generation_kwargs["stop"] = generation_kwargs.get("stop", []) - generation_kwargs["stop"].extend(stop_words or []) - generation_kwargs.setdefault("max_tokens", 512) - - self.api_type = api_type - self.api_params = api_params - self.token = token - self.generation_kwargs = generation_kwargs - self.streaming_callback = streaming_callback - - resolved_api_params: dict[str, Any] = {k: v for k, v in api_params.items() if k != "model" and k != "url"} - self._client = InferenceClient( - model_or_url, token=token.resolve_value() if token else None, **resolved_api_params - ) - self._async_client = AsyncInferenceClient( - model_or_url, token=token.resolve_value() if token else None, **resolved_api_params - ) - self.tools = tools - self._is_warmed_up = False - - def warm_up(self) -> None: - """ - Warm up the Hugging Face API chat generator. - - This will warm up the tools registered in the chat generator. - This method is idempotent and will only warm up the tools once. - """ - if not self._is_warmed_up: - warm_up_tools(self.tools) - self._is_warmed_up = True - - def to_dict(self) -> dict[str, Any]: - """ - Serialize this component to a dictionary. - - :returns: - A dictionary containing the serialized component. - """ - callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None - return default_to_dict( - self, - api_type=str(self.api_type), - api_params=self.api_params, - token=self.token, - generation_kwargs=self.generation_kwargs, - streaming_callback=callback_name, - tools=serialize_tools_or_toolset(self.tools), - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPIChatGenerator": - """ - Deserialize this component from a dictionary. 
- """ - deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools") - init_params = data.get("init_parameters", {}) - serialized_callback_handler = init_params.get("streaming_callback") - if serialized_callback_handler: - data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler) - return default_from_dict(cls, data) - - @component.output_types(replies=list[ChatMessage]) - def run( - self, - messages: list[ChatMessage] | str, - generation_kwargs: dict[str, Any] | None = None, - tools: ToolsType | None = None, - streaming_callback: StreamingCallbackT | None = None, - ) -> dict[str, list[ChatMessage]]: - """ - Invoke the text generation inference based on the provided messages and generation parameters. - - :param messages: - A list of ChatMessage objects representing the input messages. If a string is provided, it is converted - to a list containing a ChatMessage with user role. - :param generation_kwargs: - Additional keyword arguments for text generation. - :param tools: - A list of tools or a Toolset for which the model can prepare calls. If set, it will override - the `tools` parameter set during component initialization. This parameter can accept either a - list of `Tool` objects or a `Toolset` instance. - :param streaming_callback: - An optional callable for handling streaming responses. If set, it will override the `streaming_callback` - parameter set during component initialization. - :returns: A dictionary with the following keys: - - `replies`: A list containing the generated responses as ChatMessage objects. 
- """ - if not self._is_warmed_up: - self.warm_up() - - messages = _normalize_messages(messages) - - # update generation kwargs by merging with the default ones - generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})} - - formatted_messages = [convert_message_to_hf_format(message) for message in messages] - - tools = tools or self.tools - if tools and self.streaming_callback: - raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.") - flat_tools = flatten_tools_or_toolsets(tools) - _check_duplicate_tool_names(flat_tools) - - # validate and select the streaming callback - streaming_callback = select_streaming_callback( - self.streaming_callback, streaming_callback, requires_async=False - ) - - if streaming_callback: - return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback) - - hf_tools = _convert_tools_to_hfapi_tools(tools) - - return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools) - - @component.output_types(replies=list[ChatMessage]) - async def run_async( - self, - messages: list[ChatMessage] | str, - generation_kwargs: dict[str, Any] | None = None, - tools: ToolsType | None = None, - streaming_callback: StreamingCallbackT | None = None, - ) -> dict[str, list[ChatMessage]]: - """ - Asynchronously invokes the text generation inference based on the provided messages and generation parameters. - - This is the asynchronous version of the `run` method. It has the same parameters - and return values but can be used with `await` in an async code. - - :param messages: - A list of ChatMessage objects representing the input messages. If a string is provided, it is converted - to a list containing a ChatMessage with user role. - :param generation_kwargs: - Additional keyword arguments for text generation. - :param tools: - A list of tools or a Toolset for which the model can prepare calls. 
If set, it will override the `tools` - parameter set during component initialization. This parameter can accept either a list of `Tool` objects - or a `Toolset` instance. - :param streaming_callback: - An optional callable for handling streaming responses. If set, it will override the `streaming_callback` - parameter set during component initialization. - :returns: A dictionary with the following keys: - - `replies`: A list containing the generated responses as ChatMessage objects. - """ - if not self._is_warmed_up: - self.warm_up() - - messages = _normalize_messages(messages) - - # update generation kwargs by merging with the default ones - generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})} - - formatted_messages = [convert_message_to_hf_format(message) for message in messages] - - tools = tools or self.tools - if tools and self.streaming_callback: - raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.") - flat_tools = flatten_tools_or_toolsets(tools) - _check_duplicate_tool_names(flat_tools) - - # validate and select the streaming callback - streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True) - - if streaming_callback: - return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback) - - hf_tools = _convert_tools_to_hfapi_tools(tools) - - return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools) - - def _run_streaming( - self, - messages: list[dict[str, str]], - generation_kwargs: dict[str, Any], - streaming_callback: SyncStreamingCallbackT, - ) -> dict[str, list[ChatMessage]]: - api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion( - messages, - stream=True, - stream_options=ChatCompletionInputStreamOptions(include_usage=True), - **generation_kwargs, - ) - - component_info = ComponentInfo.from_component(self) - streaming_chunks: 
list[StreamingChunk] = [] - for chunk in api_output: - streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk( - chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info - ) - streaming_chunks.append(streaming_chunk) - streaming_callback(streaming_chunk) - - message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks) - if message.meta.get("usage") is None: - message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0} - - return {"replies": [message]} - - def _run_non_streaming( - self, - messages: list[dict[str, str]], - generation_kwargs: dict[str, Any], - tools: list["ChatCompletionInputTool"] | None = None, - ) -> dict[str, list[ChatMessage]]: - api_chat_output: ChatCompletionOutput = self._client.chat_completion( - messages=messages, tools=tools, **generation_kwargs - ) - - if api_chat_output.choices is None or len(api_chat_output.choices) == 0: - return {"replies": []} - - # n is unused, so the API always returns only one choice - # the argument is probably allowed for compatibility with OpenAI - # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n - choice = api_chat_output.choices[0] - - text = choice.message.content - - tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls) - - # Extract reasoning content if present - reasoning = _extract_reasoning_content(choice.message) - - mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None - meta: dict[str, Any] = { - "model": self._client.model, - "finish_reason": mapped_finish_reason, - "index": choice.index, - } - - usage = {"prompt_tokens": 0, "completion_tokens": 0} - if api_chat_output.usage: - usage = { - "prompt_tokens": api_chat_output.usage.prompt_tokens, - "completion_tokens": api_chat_output.usage.completion_tokens, - } - meta["usage"] = usage - - message = ChatMessage.from_assistant(text=text, 
tool_calls=tool_calls, reasoning=reasoning, meta=meta) - return {"replies": [message]} - - async def _run_streaming_async( - self, messages: list[dict[str, str]], generation_kwargs: dict[str, Any], streaming_callback: StreamingCallbackT - ) -> dict[str, list[ChatMessage]]: - api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion( - messages, - stream=True, - stream_options=ChatCompletionInputStreamOptions(include_usage=True), - **generation_kwargs, - ) - - component_info = ComponentInfo.from_component(self) - streaming_chunks: list[StreamingChunk] = [] - async for chunk in api_output: - stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk( - chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info - ) - streaming_chunks.append(stream_chunk) - await _invoke_streaming_callback(streaming_callback, stream_chunk) - - message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks) - if message.meta.get("usage") is None: - message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0} - - return {"replies": [message]} - - async def _run_non_streaming_async( - self, - messages: list[dict[str, str]], - generation_kwargs: dict[str, Any], - tools: list["ChatCompletionInputTool"] | None = None, - ) -> dict[str, list[ChatMessage]]: - api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion( - messages=messages, tools=tools, **generation_kwargs - ) - - if api_chat_output.choices is None or len(api_chat_output.choices) == 0: - return {"replies": []} - - choice = api_chat_output.choices[0] - - text = choice.message.content - - tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls) - - # Extract reasoning content if present - reasoning = _extract_reasoning_content(choice.message) - - mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None - meta: dict[str, Any] = { - "model": self._async_client.model, - 
"finish_reason": mapped_finish_reason, - "index": choice.index, - } - - usage = {"prompt_tokens": 0, "completion_tokens": 0} - if api_chat_output.usage: - usage = { - "prompt_tokens": api_chat_output.usage.prompt_tokens, - "completion_tokens": api_chat_output.usage.completion_tokens, - } - meta["usage"] = usage - - message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, reasoning=reasoning, meta=meta) - return {"replies": [message]} diff --git a/haystack/components/rankers/__init__.py b/haystack/components/rankers/__init__.py index 7960d70384..7134043f8d 100644 --- a/haystack/components/rankers/__init__.py +++ b/haystack/components/rankers/__init__.py @@ -8,7 +8,6 @@ from lazy_imports import LazyImporter _import_structure = { - "hugging_face_tei": ["HuggingFaceTEIRanker"], "llm_ranker": ["LLMRanker"], "lost_in_the_middle": ["LostInTheMiddleRanker"], "meta_field": ["MetaFieldRanker"], @@ -19,7 +18,6 @@ } if TYPE_CHECKING: - from .hugging_face_tei import HuggingFaceTEIRanker as HuggingFaceTEIRanker from .llm_ranker import LLMRanker as LLMRanker from .lost_in_the_middle import LostInTheMiddleRanker as LostInTheMiddleRanker from .meta_field import MetaFieldRanker as MetaFieldRanker diff --git a/haystack/components/rankers/hugging_face_tei.py b/haystack/components/rankers/hugging_face_tei.py deleted file mode 100644 index 5e5957854f..0000000000 --- a/haystack/components/rankers/hugging_face_tei.py +++ /dev/null @@ -1,293 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -from dataclasses import replace -from enum import Enum -from typing import Any -from urllib.parse import urljoin - -import httpx - -from haystack import Document, component, default_from_dict, default_to_dict -from haystack.utils import Secret -from haystack.utils.misc import _deduplicate_documents -from haystack.utils.requests_utils import async_request_with_retry, request_with_retry - - -class TruncationDirection(str, Enum): - """ - 
Defines the direction to truncate text when input length exceeds the model's limit. - - Attributes: - LEFT: Truncate text from the left side (start of text). - RIGHT: Truncate text from the right side (end of text). - """ - - LEFT = "Left" - RIGHT = "Right" - - -@component -class HuggingFaceTEIRanker: - """ - Ranks documents based on their semantic similarity to the query. - - It can be used with a Text Embeddings Inference (TEI) API endpoint: - - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) - - [Hugging Face Inference Endpoints](https://huggingface.co/inference-endpoints) - - Usage example: - - ```python - from haystack import Document - from haystack.components.rankers import HuggingFaceTEIRanker - from haystack.utils import Secret - - reranker = HuggingFaceTEIRanker( - url="http://localhost:8080", - top_k=5, - timeout=30, - token=Secret.from_token("my_api_token") - ) - - docs = [Document(content="The capital of France is Paris"), Document(content="The capital of Germany is Berlin")] - - result = reranker.run(query="What is the capital of France?", documents=docs) - - ranked_docs = result["documents"] - print(ranked_docs) - # >> {'documents': [Document(id=..., content: 'the capital of France is Paris', score: 0.9979767), - # >> Document(id=..., content: 'the capital of Germany is Berlin', score: 0.13982213)]} - ``` - """ - - def __init__( - self, - *, - url: str, - top_k: int = 10, - raw_scores: bool = False, - timeout: int | None = 30, - max_retries: int = 3, - retry_status_codes: list[int] | None = None, - token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False), - ) -> None: - """ - Initializes the TEI reranker component. - - :param url: Base URL of the TEI reranking service (for example, "https://api.example.com"). - :param top_k: Maximum number of top documents to return. - :param raw_scores: If True, include raw relevance scores in the API payload. 
- :param timeout: Request timeout in seconds. - :param max_retries: Maximum number of retry attempts for failed requests. - :param retry_status_codes: List of HTTP status codes that will trigger a retry. - When None, HTTP 408, 418, 429 and 503 will be retried (default: None). - :param token: The Hugging Face token to use as HTTP bearer authorization. Not always required - depending on your TEI server configuration. - Check your HF token in your [account settings](https://huggingface.co/settings/tokens). - """ - self.url = url - self.top_k = top_k - self.timeout = timeout - self.token = token - self.max_retries = max_retries - self.retry_status_codes = retry_status_codes - self.raw_scores = raw_scores - - def to_dict(self) -> dict[str, Any]: - """ - Serializes the component to a dictionary. - - :returns: - Dictionary with serialized data. - """ - return default_to_dict( - self, - url=self.url, - top_k=self.top_k, - timeout=self.timeout, - token=self.token, - max_retries=self.max_retries, - retry_status_codes=self.retry_status_codes, - ) - - @classmethod - def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceTEIRanker": - """ - Deserializes the component from a dictionary. - - :param data: - Dictionary to deserialize from. - :returns: - Deserialized component. - """ - return default_from_dict(cls, data) - - def _compose_response( - self, result: dict[str, str] | list[dict[str, Any]], top_k: int | None, documents: list[Document] - ) -> dict[str, list[Document]]: - """ - Processes the API response into a structured format. - - :param result: The raw response from the API. - - :returns: A dictionary with the following keys: - - `documents`: A list of reranked documents. - - :raises RuntimeError: - - If the API request fails. - - :raises RuntimeError: - - If the API returns an error response. - - :raises TypeError: - - If the API response is not in the expected list format. 
- """ - if isinstance(result, dict) and "error" in result: - error_type = result.get("error_type", "UnknownError") - error_msg = result.get("error", "No additional information.") - raise RuntimeError(f"HuggingFaceTEIRanker API call failed ({error_type}): {error_msg}") - - # Ensure we have a list of score dicts - if not isinstance(result, list): - # Expected list or dict, but encountered an unknown response format. - error_msg = f"Expected a list of score dictionaries, but got `{type(result).__name__}`. " - error_msg += f"Response content: {result}" - raise TypeError(f"Unexpected response format from text-embeddings-inference rerank API: {error_msg}") - - # Determine number of docs to return - final_k = min(top_k or self.top_k, len(result)) - - # Select and return the top_k documents - ranked_docs = [] - for item in result[:final_k]: - index: int = item["index"] - ranked_docs.append(replace(documents[index], score=item["score"])) - return {"documents": ranked_docs} - - @component.output_types(documents=list[Document]) - def run( - self, - query: str, - documents: list[Document], - top_k: int | None = None, - truncation_direction: TruncationDirection | None = None, - ) -> dict[str, list[Document]]: - """ - Reranks the provided documents by relevance to the query using the TEI API. - - Before ranking, documents are deduplicated by their id, retaining only the document with the highest score - if a score is present. - - :param query: The user query string to guide reranking. - :param documents: List of `Document` objects to rerank. - :param top_k: Optional override for the maximum number of documents to return. - :param truncation_direction: If set, enables text truncation in the specified direction. - - :returns: A dictionary with the following keys: - - `documents`: A list of reranked documents. - - :raises RuntimeError: - - If the API request fails. - - :raises RuntimeError: - - If the API returns an error response. 
- - :raises TypeError: - - If the API response is not in the expected list format. - """ - # Return empty if no documents provided - if not documents: - return {"documents": []} - - # Prepare the payload - deduplicated_documents = _deduplicate_documents(documents) - texts = [doc.content for doc in deduplicated_documents] - payload: dict[str, Any] = {"query": query, "texts": texts, "raw_scores": self.raw_scores} - if truncation_direction: - payload.update({"truncate": True, "truncation_direction": truncation_direction.value}) - - headers = {} - if self.token and self.token.resolve_value(): - headers["Authorization"] = f"Bearer {self.token.resolve_value()}" - - # Call the external service with retry - try: - response = request_with_retry( - method="POST", - url=urljoin(self.url, "/rerank"), - json=payload, - timeout=self.timeout, - headers=headers, - attempts=self.max_retries, - status_codes_to_retry=self.retry_status_codes, - ) - except httpx.HTTPStatusError as e: - raise RuntimeError(f"HuggingFaceTEIRanker API call failed. Error: {e}, Response: {e.response.text}") from e - - result: dict[str, str] | list[dict[str, Any]] = response.json() - - return self._compose_response(result, top_k, deduplicated_documents) - - @component.output_types(documents=list[Document]) - async def run_async( - self, - query: str, - documents: list[Document], - top_k: int | None = None, - truncation_direction: TruncationDirection | None = None, - ) -> dict[str, list[Document]]: - """ - Asynchronously reranks the provided documents by relevance to the query using the TEI API. - - Before ranking, documents are deduplicated by their id, retaining only the document with the highest score - if a score is present. - - :param query: The user query string to guide reranking. - :param documents: List of `Document` objects to rerank. - :param top_k: Optional override for the maximum number of documents to return. 
- :param truncation_direction: If set, enables text truncation in the specified direction. - - :returns: A dictionary with the following keys: - - `documents`: A list of reranked documents. - - :raises httpx.RequestError: - - If the API request fails. - :raises RuntimeError: - - If the API returns an error response. - :raises TypeError: - - If the API response is not in the expected list format. - """ - # Return empty if no documents provided - if not documents: - return {"documents": []} - - # Prepare the payload - deduplicated_documents = _deduplicate_documents(documents) - texts = [doc.content for doc in deduplicated_documents] - payload: dict[str, Any] = {"query": query, "texts": texts, "raw_scores": self.raw_scores} - if truncation_direction: - payload.update({"truncate": True, "truncation_direction": truncation_direction.value}) - - headers = {} - if self.token and self.token.resolve_value(): - headers["Authorization"] = f"Bearer {self.token.resolve_value()}" - - # Call the external service with retry - try: - response = await async_request_with_retry( - method="POST", - url=urljoin(self.url, "/rerank"), - json=payload, - timeout=self.timeout, - headers=headers, - attempts=self.max_retries, - status_codes_to_retry=self.retry_status_codes, - ) - except httpx.HTTPStatusError as e: - raise RuntimeError(f"HuggingFaceTEIRanker API call failed. 
Error: {e}, Response: {e.response.text}") from e - - result: dict[str, str] | list[dict[str, Any]] = response.json() - - return self._compose_response(result, top_k, deduplicated_documents) diff --git a/haystack/utils/hf.py b/haystack/utils/hf.py index c4bcf2a8e9..faa2606a36 100644 --- a/haystack/utils/hf.py +++ b/haystack/utils/hf.py @@ -4,7 +4,6 @@ import asyncio import copy -from enum import Enum from typing import Any from haystack import logging @@ -27,84 +26,11 @@ import torch with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import: - from huggingface_hub import HfApi, model_info - from huggingface_hub.utils import RepositoryNotFoundError + from huggingface_hub import model_info logger = logging.getLogger(__name__) -class HFGenerationAPIType(Enum): - """ - API type to use for Hugging Face API Generators. - """ - - # HF [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference). - TEXT_GENERATION_INFERENCE = "text_generation_inference" - - # HF [Inference Endpoints](https://huggingface.co/inference-endpoints). - INFERENCE_ENDPOINTS = "inference_endpoints" - - # HF [Serverless Inference API](https://huggingface.co/inference-api). - SERVERLESS_INFERENCE_API = "serverless_inference_api" - - def __str__(self) -> str: - return self.value - - @staticmethod - def from_str(string: str) -> "HFGenerationAPIType": - """ - Convert a string to a HFGenerationAPIType enum. - - :param string: The string to convert. - :return: The corresponding HFGenerationAPIType enum. - - """ - enum_map = {e.value: e for e in HFGenerationAPIType} - mode = enum_map.get(string) - if mode is None: - msg = f"Unknown Hugging Face API type '{string}'. Supported types are: {list(enum_map.keys())}" - raise ValueError(msg) - return mode - - -class HFEmbeddingAPIType(Enum): - """ - API type to use for Hugging Face API Embedders. 
- """ - - # HF [Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference). - TEXT_EMBEDDINGS_INFERENCE = "text_embeddings_inference" - - # HF [Inference Endpoints](https://huggingface.co/inference-endpoints). - INFERENCE_ENDPOINTS = "inference_endpoints" - - # HF [Serverless Inference API](https://huggingface.co/inference-api). - SERVERLESS_INFERENCE_API = "serverless_inference_api" - - def __str__(self) -> str: - return self.value - - @staticmethod - def from_str(string: str) -> "HFEmbeddingAPIType": - """ - Convert a string to a HFEmbeddingAPIType enum. - - :param string: - :return: The corresponding HFEmbeddingAPIType enum. - """ - enum_map = {e.value: e for e in HFEmbeddingAPIType} - mode = enum_map.get(string) - if mode is None: - msg = f"Unknown Hugging Face API type '{string}'. Supported types are: {list(enum_map.keys())}" - raise ValueError(msg) - return mode - - -class HFModelType(Enum): - EMBEDDING = 1 - GENERATION = 2 - - def serialize_hf_model_kwargs(kwargs: dict[str, Any]) -> None: """ Recursively serialize HuggingFace specific model keyword arguments in-place to make them JSON serializable. @@ -220,41 +146,6 @@ def resolve_hf_pipeline_kwargs( return huggingface_pipeline_kwargs -def check_valid_model(model_id: str, model_type: HFModelType, token: Secret | None) -> None: - """ - Check if the provided model ID corresponds to a valid model on HuggingFace Hub. - - Also check if the model is an embedding or generation model. - - :param model_id: A string representing the HuggingFace model ID. - :param model_type: the model type, HFModelType.EMBEDDING or HFModelType.GENERATION - :param token: The optional authentication token. - :raises ValueError: If the model is not found or is not a embedding model. 
- """ - huggingface_hub_import.check() - - api = HfApi() - try: - model_info = api.model_info(model_id, token=token.resolve_value() if token else None) - except RepositoryNotFoundError as e: - raise ValueError( - f"Model {model_id} not found on HuggingFace Hub. Please provide a valid HuggingFace model_id." - ) from e - - if model_type == HFModelType.EMBEDDING: - allowed_model = model_info.pipeline_tag in ["sentence-similarity", "feature-extraction"] - error_msg = f"Model {model_id} is not a embedding model. Please provide a embedding model." - elif model_type == HFModelType.GENERATION: - allowed_model = model_info.pipeline_tag in ["text-generation", "text2text-generation", "image-text-to-text"] - error_msg = f"Model {model_id} is not a text generation model. Please provide a text generation model." - else: - allowed_model = False - error_msg = f"Unknown model type for {model_id}" - - if not allowed_model: - raise ValueError(error_msg) - - def convert_message_to_hf_format(message: ChatMessage) -> dict[str, Any]: """ Convert a message to the format expected by Hugging Face. 
diff --git a/pydoc/embedders_api.yml b/pydoc/embedders_api.yml index d97773efef..4533d36049 100644 --- a/pydoc/embedders_api.yml +++ b/pydoc/embedders_api.yml @@ -1,10 +1,17 @@ loaders: - search_path: [../haystack/components/embedders] - modules: ["azure_document_embedder", "azure_text_embedder", "hugging_face_api_document_embedder", - "hugging_face_api_text_embedder", "openai_document_embedder", "openai_text_embedder", - "sentence_transformers_document_embedder", "sentence_transformers_text_embedder", - "sentence_transformers_sparse_document_embedder", "sentence_transformers_sparse_text_embedder", - "image/sentence_transformers_doc_image_embedder"] + modules: + [ + "azure_document_embedder", + "azure_text_embedder", + "openai_document_embedder", + "openai_text_embedder", + "sentence_transformers_document_embedder", + "sentence_transformers_text_embedder", + "sentence_transformers_sparse_document_embedder", + "sentence_transformers_sparse_text_embedder", + "image/sentence_transformers_doc_image_embedder", + ] processors: - type: filter documented_only: true diff --git a/pydoc/generators_api.yml b/pydoc/generators_api.yml index 198d2d3120..20af90637f 100644 --- a/pydoc/generators_api.yml +++ b/pydoc/generators_api.yml @@ -5,7 +5,6 @@ loaders: "chat/azure", "chat/azure_responses", "chat/fallback", - "chat/hugging_face_api", "chat/hugging_face_local", "chat/llm", "chat/openai", diff --git a/pydoc/rankers_api.yml b/pydoc/rankers_api.yml index a70d4ec823..d4b5015f95 100644 --- a/pydoc/rankers_api.yml +++ b/pydoc/rankers_api.yml @@ -1,6 +1,6 @@ loaders: - search_path: [../haystack/components/rankers] - modules: ["hugging_face_tei", "llm_ranker", "lost_in_the_middle", "meta_field", "meta_field_grouping_ranker", + modules: ["llm_ranker", "lost_in_the_middle", "meta_field", "meta_field_grouping_ranker", "sentence_transformers_diversity", "sentence_transformers_similarity", "transformers_similarity"] processors: - type: filter diff --git a/pyproject.toml b/pyproject.toml 
index 888e9a770c..579435a89f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -93,7 +93,6 @@ dependencies = [ # EvaluationRunResult, XLSXToDocument, and pipeline tests "transformers[torch, sentencepiece]>=4.57", # ExtractiveReader, TransformersSimilarityRanker, LocalWhisperTranscriber, HFGenerators... - "huggingface_hub>=0.27.0", # Hugging Face API Generators and Embedders "sentence-transformers>=5.0.0", # Sentence Transformers Embedders, Rankers, and SASEvaluator "langdetect", # TextLanguageRouter and DocumentLanguageClassifier "openai-whisper>=20231106", # LocalWhisperTranscriber diff --git a/releasenotes/notes/remove-hf-api-components-97bb895a321bc3ab.yaml b/releasenotes/notes/remove-hf-api-components-97bb895a321bc3ab.yaml new file mode 100644 index 0000000000..a0ee623c88 --- /dev/null +++ b/releasenotes/notes/remove-hf-api-components-97bb895a321bc3ab.yaml @@ -0,0 +1,25 @@ +--- +upgrade: + - | + ``HuggingFaceAPIChatGenerator``, ``HuggingFaceAPITextEmbedder``, ``HuggingFaceAPIDocumentEmbedder``, and + ``HuggingFaceTEIRanker`` have been moved out of Haystack into the ``huggingface-api-haystack`` integration + package. Install the new package with ``pip install huggingface-api-haystack`` and update your imports. + + Before: + + .. code:: python + + from haystack.components.generators.chat import HuggingFaceAPIChatGenerator + from haystack.components.embedders import HuggingFaceAPITextEmbedder, HuggingFaceAPIDocumentEmbedder + from haystack.components.rankers import HuggingFaceTEIRanker + + After: + + .. 
code:: python + + from haystack_integrations.components.generators.huggingface_api import HuggingFaceAPIChatGenerator + from haystack_integrations.components.embedders.huggingface_api import ( + HuggingFaceAPITextEmbedder, + HuggingFaceAPIDocumentEmbedder, + ) + from haystack_integrations.components.rankers.huggingface_api import HuggingFaceTEIRanker diff --git a/test/components/embedders/test_hugging_face_api_document_embedder.py b/test/components/embedders/test_hugging_face_api_document_embedder.py deleted file mode 100644 index 477123ae51..0000000000 --- a/test/components/embedders/test_hugging_face_api_document_embedder.py +++ /dev/null @@ -1,604 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -import os -import random -import sys -from unittest.mock import MagicMock, patch - -import pytest -from huggingface_hub.utils import RepositoryNotFoundError -from numpy import array - -from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder -from haystack.dataclasses import Document -from haystack.utils.auth import Secret -from haystack.utils.hf import HFEmbeddingAPIType - - -@pytest.fixture -def mock_check_valid_model(): - with patch( - "haystack.components.embedders.hugging_face_api_document_embedder.check_valid_model", - MagicMock(return_value=None), - ) as mock: - yield mock - - -def mock_embedding_generation(text, **kwargs): - return array([[random.random() for _ in range(384)] for _ in range(len(text))]) - - -class TestHuggingFaceAPIDocumentEmbedder: - def test_init_invalid_api_type(self): - with pytest.raises(ValueError): - HuggingFaceAPIDocumentEmbedder(api_type="invalid_api_type", api_params={}) - - def test_init_serverless(self, mock_check_valid_model): - model = "BAAI/bge-small-en-v1.5" - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model} - ) - - assert embedder.api_type == 
HFEmbeddingAPIType.SERVERLESS_INFERENCE_API - assert embedder.api_params == {"model": model} - assert embedder.prefix == "" - assert embedder.suffix == "" - assert embedder.truncate - assert not embedder.normalize - assert embedder.batch_size == 32 - assert embedder.progress_bar - assert embedder.meta_fields_to_embed == [] - assert embedder.embedding_separator == "\n" - - def test_init_serverless_invalid_model(self, mock_check_valid_model): - mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id", response=MagicMock()) - with pytest.raises(RepositoryNotFoundError): - HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "invalid_model_id"} - ) - - def test_init_serverless_no_model(self): - with pytest.raises(ValueError): - HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"param": "irrelevant"} - ) - - def test_init_tei(self): - url = "https://some_model.com" - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": url} - ) - - assert embedder.api_type == HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE - assert embedder.api_params == {"url": url} - assert embedder.prefix == "" - assert embedder.suffix == "" - assert embedder.truncate - assert not embedder.normalize - assert embedder.batch_size == 32 - assert embedder.progress_bar - assert embedder.meta_fields_to_embed == [] - assert embedder.embedding_separator == "\n" - - def test_init_tei_invalid_url(self): - with pytest.raises(ValueError): - HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": "invalid_url"} - ) - - def test_init_tei_no_url(self): - with pytest.raises(ValueError): - HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"param": "irrelevant"} - ) - - def test_to_dict(self, mock_check_valid_model): - 
embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - prefix="prefix", - suffix="suffix", - truncate=False, - normalize=True, - batch_size=128, - progress_bar=False, - meta_fields_to_embed=["meta_field"], - embedding_separator=" ", - concurrency_limit=7, - ) - - data = embedder.to_dict() - - assert data == { - "type": "haystack.components.embedders.hugging_face_api_document_embedder.HuggingFaceAPIDocumentEmbedder", - "init_parameters": { - "api_type": "serverless_inference_api", - "api_params": {"model": "BAAI/bge-small-en-v1.5"}, - "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, - "prefix": "prefix", - "suffix": "suffix", - "truncate": False, - "normalize": True, - "batch_size": 128, - "progress_bar": False, - "meta_fields_to_embed": ["meta_field"], - "embedding_separator": " ", - "concurrency_limit": 7, - }, - } - - def test_from_dict(self, mock_check_valid_model): - data = { - "type": "haystack.components.embedders.hugging_face_api_document_embedder.HuggingFaceAPIDocumentEmbedder", - "init_parameters": { - "api_type": HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - "api_params": {"model": "BAAI/bge-small-en-v1.5"}, - "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, - "prefix": "prefix", - "suffix": "suffix", - "truncate": False, - "normalize": True, - "batch_size": 128, - "progress_bar": False, - "meta_fields_to_embed": ["meta_field"], - "embedding_separator": " ", - "concurrency_limit": 7, - }, - } - - embedder = HuggingFaceAPIDocumentEmbedder.from_dict(data) - - assert embedder.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API - assert embedder.api_params == {"model": "BAAI/bge-small-en-v1.5"} - assert embedder.prefix == "prefix" - assert embedder.suffix == "suffix" - assert not embedder.truncate - assert embedder.normalize - assert embedder.batch_size == 128 - assert not 
embedder.progress_bar - assert embedder.meta_fields_to_embed == ["meta_field"] - assert embedder.embedding_separator == " " - assert embedder.concurrency_limit == 7 - - def test_prepare_texts_to_embed_w_metadata(self): - documents = [ - Document(content=f"document number {i}: content", meta={"meta_field": f"meta_value {i}"}) for i in range(5) - ] - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, - api_params={"url": "https://some_model.com"}, - token=Secret.from_token("fake-api-token"), - meta_fields_to_embed=["meta_field"], - embedding_separator=" | ", - ) - - prepared_texts = embedder._prepare_texts_to_embed(documents) - - assert prepared_texts == [ - "meta_value 0 | document number 0: content", - "meta_value 1 | document number 1: content", - "meta_value 2 | document number 2: content", - "meta_value 3 | document number 3: content", - "meta_value 4 | document number 4: content", - ] - - def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model): - documents = [Document(content=f"document number {i}") for i in range(5)] - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, - api_params={"url": "https://some_model.com"}, - token=Secret.from_token("fake-api-token"), - prefix="my_prefix ", - suffix=" my_suffix", - ) - - prepared_texts = embedder._prepare_texts_to_embed(documents) - - assert prepared_texts == [ - "my_prefix document number 0 my_suffix", - "my_prefix document number 1 my_suffix", - "my_prefix document number 2 my_suffix", - "my_prefix document number 3 my_suffix", - "my_prefix document number 4 my_suffix", - ] - - def test_embed_batch(self, mock_check_valid_model, caplog): - texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] - - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.side_effect = mock_embedding_generation - - embedder = HuggingFaceAPIDocumentEmbedder( - 
api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - ) - embeddings = embedder._embed_batch(texts_to_embed=texts, batch_size=2) - - assert mock_embedding_patch.call_count == 3 - - assert isinstance(embeddings, list) - assert len(embeddings) == len(texts) - for embedding in embeddings: - assert isinstance(embedding, list) - assert len(embedding) == 384 - assert all(isinstance(x, float) for x in embedding) - - # Check that logger warnings about ignoring truncate and normalize are raised - assert len(caplog.records) == 2 - assert "truncate" in caplog.records[0].message - assert "normalize" in caplog.records[1].message - - def test_embed_batch_wrong_embedding_shape(self, mock_check_valid_model): - texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] - - # embedding ndim != 2 - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.return_value = array([0.1, 0.2, 0.3]) - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - ) - - with pytest.raises(ValueError): - embedder._embed_batch(texts_to_embed=texts, batch_size=2) - - # embedding ndim == 2 but shape[0] != len(batch) - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.return_value = array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]) - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - ) - - with pytest.raises(ValueError): - embedder._embed_batch(texts_to_embed=texts, batch_size=2) - - def test_run_wrong_input_format(self, mock_check_valid_model): - embedder = HuggingFaceAPIDocumentEmbedder( - 
api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"} - ) - - list_integers_input = [1, 2, 3] - - with pytest.raises(TypeError): - embedder.run(text=list_integers_input) - - def test_run_on_empty_list(self, mock_check_valid_model): - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - ) - - empty_list_input = [] - result = embedder.run(documents=empty_list_input) - - assert result["documents"] is not None - assert not result["documents"] # empty list - - def test_run(self, mock_check_valid_model): - docs = [ - Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), - ] - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.side_effect = mock_embedding_generation - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - prefix="prefix ", - suffix=" suffix", - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - ) - - result = embedder.run(documents=docs) - - mock_embedding_patch.assert_called_once_with( - text=[ - "prefix Cuisine | I love cheese suffix", - "prefix ML | A transformer is a deep learning architecture suffix", - ], - truncate=None, - normalize=None, - ) - - documents_with_embeddings = result["documents"] - - assert isinstance(documents_with_embeddings, list) - assert len(documents_with_embeddings) == len(docs) - for doc, new_doc in zip(docs, documents_with_embeddings, strict=True): - assert doc.embedding is None - assert new_doc is not doc - assert isinstance(new_doc, Document) - assert isinstance(new_doc.embedding, list) - assert len(new_doc.embedding) == 384 - 
assert all(isinstance(x, float) for x in new_doc.embedding) - - def test_run_custom_batch_size(self, mock_check_valid_model): - docs = [ - Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), - ] - - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.side_effect = mock_embedding_generation - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - prefix="prefix ", - suffix=" suffix", - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - batch_size=1, - ) - - result = embedder.run(documents=docs) - - assert mock_embedding_patch.call_count == 2 - - documents_with_embeddings = result["documents"] - - assert isinstance(documents_with_embeddings, list) - assert len(documents_with_embeddings) == len(docs) - for doc in documents_with_embeddings: - assert isinstance(doc, Document) - assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 384 - assert all(isinstance(x, float) for x in doc.embedding) - - def test_adjust_api_parameters(self): - truncate, normalize = HuggingFaceAPIDocumentEmbedder._adjust_api_parameters( - True, False, HFEmbeddingAPIType.SERVERLESS_INFERENCE_API - ) - assert truncate is None - assert normalize is None - - truncate, normalize = HuggingFaceAPIDocumentEmbedder._adjust_api_parameters( - True, False, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE - ) - assert truncate is True - assert normalize is False - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - @pytest.mark.flaky(reruns=3, reruns_delay=10) - @pytest.mark.skipif(sys.platform != "linux", reason="We 
only test on Linux to avoid overloading the HF server") - @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s") - def test_live_run_serverless(self): - docs = [ - Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), - ] - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "sentence-transformers/all-MiniLM-L6-v2"}, - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - ) - embedder._client.timeout = 10 # we want to fail fast if the server is not responding - result = embedder.run(documents=docs) - documents_with_embeddings = result["documents"] - - assert isinstance(documents_with_embeddings, list) - assert len(documents_with_embeddings) == len(docs) - for doc in documents_with_embeddings: - assert isinstance(doc, Document) - assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 384 - assert all(isinstance(x, float) for x in doc.embedding) - - @pytest.mark.asyncio - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - @pytest.mark.flaky(reruns=3, reruns_delay=10) - @pytest.mark.skipif(sys.platform != "linux", reason="We only test on Linux to avoid overloading the HF server") - @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s") - async def test_live_run_serverless_async(self) -> None: - docs = [ - Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), - ] - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "sentence-transformers/all-MiniLM-L6-v2"}, - meta_fields_to_embed=["topic"], - 
embedding_separator=" | ", - ) - embedder._async_client.timeout = 10 # we want to fail fast if the server is not responding - result = await embedder.run_async(documents=docs) - documents_with_embeddings = result["documents"] - - assert isinstance(documents_with_embeddings, list) - assert len(documents_with_embeddings) == len(docs) - for doc in documents_with_embeddings: - assert isinstance(doc, Document) - assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 384 - assert all(isinstance(x, float) for x in doc.embedding) - - @pytest.mark.asyncio - async def test_embed_batch_async(self, mock_check_valid_model, caplog): - texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] - - with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.side_effect = mock_embedding_generation - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - concurrency_limit=4, - ) - embeddings = await embedder._embed_batch_async(texts_to_embed=texts, batch_size=2) - - assert mock_embedding_patch.call_count == 3 - - assert isinstance(embeddings, list) - assert len(embeddings) == len(texts) - for embedding in embeddings: - assert isinstance(embedding, list) - assert len(embedding) == 384 - assert all(isinstance(x, float) for x in embedding) - - # Check that logger warnings about ignoring truncate and normalize are raised - assert len(caplog.records) == 2 - assert "truncate" in caplog.records[0].message - assert "normalize" in caplog.records[1].message - - @pytest.mark.asyncio - async def test_embed_batch_async_wrong_embedding_shape(self, mock_check_valid_model): - texts = ["text 1", "text 2", "text 3", "text 4", "text 5"] - - # embedding ndim != 2 - with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch: - 
mock_embedding_patch.return_value = array([0.1, 0.2, 0.3]) - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - concurrency_limit=1, - ) - - with pytest.raises(ValueError): - await embedder._embed_batch_async(texts_to_embed=texts, batch_size=2) - - # embedding ndim == 2 but shape[0] != len(batch) - with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.return_value = array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]) - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - concurrency_limit=1, - ) - - with pytest.raises(ValueError): - await embedder._embed_batch_async(texts_to_embed=texts, batch_size=2) - - @pytest.mark.asyncio - async def test_run_async_wrong_input_format(self, mock_check_valid_model): - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"} - ) - - list_integers_input = [1, 2, 3] - - with pytest.raises(TypeError): - await embedder.run_async(text=list_integers_input) - - @pytest.mark.asyncio - async def test_run_async_on_empty_list(self, mock_check_valid_model): - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - ) - - empty_list_input = [] - result = await embedder.run_async(documents=empty_list_input) - - assert result["documents"] is not None - assert not result["documents"] # empty list. 
- - @pytest.mark.asyncio - async def test_run_async(self, mock_check_valid_model): - docs = [ - Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), - ] - - with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.side_effect = mock_embedding_generation - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - prefix="prefix ", - suffix=" suffix", - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - ) - - result = await embedder.run_async(documents=docs) - - mock_embedding_patch.assert_called_once_with( - text=[ - "prefix Cuisine | I love cheese suffix", - "prefix ML | A transformer is a deep learning architecture suffix", - ], - truncate=None, - normalize=None, - ) - - documents_with_embeddings = result["documents"] - - assert isinstance(documents_with_embeddings, list) - assert len(documents_with_embeddings) == len(docs) - for doc in documents_with_embeddings: - assert isinstance(doc, Document) - assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 384 - assert all(isinstance(x, float) for x in doc.embedding) - - @pytest.mark.asyncio - async def test_run_async_custom_batch_size(self, mock_check_valid_model): - docs = [ - Document(content="I love cheese", meta={"topic": "Cuisine"}), - Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}), - ] - - with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.side_effect = mock_embedding_generation - - embedder = HuggingFaceAPIDocumentEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - 
prefix="prefix ", - suffix=" suffix", - meta_fields_to_embed=["topic"], - embedding_separator=" | ", - batch_size=1, - ) - - result = await embedder.run_async(documents=docs) - - assert mock_embedding_patch.call_count == 2 - - documents_with_embeddings = result["documents"] - - assert isinstance(documents_with_embeddings, list) - assert len(documents_with_embeddings) == len(docs) - for doc in documents_with_embeddings: - assert isinstance(doc, Document) - assert isinstance(doc.embedding, list) - assert len(doc.embedding) == 384 - assert all(isinstance(x, float) for x in doc.embedding) diff --git a/test/components/embedders/test_hugging_face_api_text_embedder.py b/test/components/embedders/test_hugging_face_api_text_embedder.py deleted file mode 100644 index 53e0d41406..0000000000 --- a/test/components/embedders/test_hugging_face_api_text_embedder.py +++ /dev/null @@ -1,259 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -import os -import random -import sys -from unittest.mock import MagicMock, patch - -import pytest -from huggingface_hub.utils import RepositoryNotFoundError -from numpy import array - -from haystack.components.embedders import HuggingFaceAPITextEmbedder -from haystack.utils.auth import Secret -from haystack.utils.hf import HFEmbeddingAPIType - - -@pytest.fixture -def mock_check_valid_model(): - with patch( - "haystack.components.embedders.hugging_face_api_text_embedder.check_valid_model", MagicMock(return_value=None) - ) as mock: - yield mock - - -class TestHuggingFaceAPITextEmbedder: - def test_init_invalid_api_type(self): - with pytest.raises(ValueError): - HuggingFaceAPITextEmbedder(api_type="invalid_api_type", api_params={}) - - def test_init_serverless(self, mock_check_valid_model): - model = "BAAI/bge-small-en-v1.5" - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model} - ) - - assert embedder.api_type == 
HFEmbeddingAPIType.SERVERLESS_INFERENCE_API - assert embedder.api_params == {"model": model} - assert embedder.prefix == "" - assert embedder.suffix == "" - assert embedder.truncate - assert not embedder.normalize - - def test_init_serverless_invalid_model(self, mock_check_valid_model): - mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id", response=MagicMock()) - with pytest.raises(RepositoryNotFoundError): - HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "invalid_model_id"} - ) - - def test_init_serverless_no_model(self): - with pytest.raises(ValueError): - HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"param": "irrelevant"} - ) - - def test_init_tei(self): - url = "https://some_model.com" - - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": url} - ) - - assert embedder.api_type == HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE - assert embedder.api_params == {"url": url} - assert embedder.prefix == "" - assert embedder.suffix == "" - assert embedder.truncate - assert not embedder.normalize - - def test_init_tei_invalid_url(self): - with pytest.raises(ValueError): - HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": "invalid_url"} - ) - - def test_init_tei_no_url(self): - with pytest.raises(ValueError): - HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"param": "irrelevant"} - ) - - def test_to_dict(self, mock_check_valid_model): - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - prefix="prefix", - suffix="suffix", - truncate=False, - normalize=True, - ) - - data = embedder.to_dict() - - assert data == { - "type": 
"haystack.components.embedders.hugging_face_api_text_embedder.HuggingFaceAPITextEmbedder", - "init_parameters": { - "api_type": "serverless_inference_api", - "api_params": {"model": "BAAI/bge-small-en-v1.5"}, - "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, - "prefix": "prefix", - "suffix": "suffix", - "truncate": False, - "normalize": True, - }, - } - - def test_from_dict(self, mock_check_valid_model): - data = { - "type": "haystack.components.embedders.hugging_face_api_text_embedder.HuggingFaceAPITextEmbedder", - "init_parameters": { - "api_type": HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - "api_params": {"model": "BAAI/bge-small-en-v1.5"}, - "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}, - "prefix": "prefix", - "suffix": "suffix", - "truncate": False, - "normalize": True, - }, - } - - embedder = HuggingFaceAPITextEmbedder.from_dict(data) - - assert embedder.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API - assert embedder.api_params == {"model": "BAAI/bge-small-en-v1.5"} - assert embedder.prefix == "prefix" - assert embedder.suffix == "suffix" - assert not embedder.truncate - assert embedder.normalize - - def test_run_wrong_input_format(self, mock_check_valid_model): - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"} - ) - - list_integers_input = [1, 2, 3] - - with pytest.raises(TypeError): - embedder.run(text=list_integers_input) - - def test_run(self, mock_check_valid_model, caplog): - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.return_value = array([[random.random() for _ in range(384)]]) - - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - prefix="prefix ", - 
suffix=" suffix", - ) - - result = embedder.run(text="The food was delicious") - - mock_embedding_patch.assert_called_once_with( - text="prefix The food was delicious suffix", truncate=None, normalize=None - ) - - assert len(result["embedding"]) == 384 - assert all(isinstance(x, float) for x in result["embedding"]) - - # Check that warnings about ignoring truncate and normalize are raised - assert len(caplog.records) == 2 - assert "truncate" in caplog.records[0].message - assert "normalize" in caplog.records[1].message - - @pytest.mark.asyncio - async def test_run_async(self, mock_check_valid_model, caplog): - with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.return_value = array([[random.random() for _ in range(384)]]) - - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "BAAI/bge-small-en-v1.5"}, - token=Secret.from_token("fake-api-token"), - prefix="prefix ", - suffix=" suffix", - ) - - result = await embedder.run_async(text="The food was delicious") - - mock_embedding_patch.assert_called_once_with( - text="prefix The food was delicious suffix", truncate=None, normalize=None - ) - - assert len(result["embedding"]) == 384 - assert all(isinstance(x, float) for x in result["embedding"]) - - # Check that warnings about ignoring truncate and normalize are raised - assert len(caplog.records) == 2 - assert "truncate" in caplog.records[0].message - assert "normalize" in caplog.records[1].message - - def test_run_wrong_embedding_shape(self, mock_check_valid_model): - # embedding ndim > 2 - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.return_value = array([[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]]) - - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"} - ) - - with 
pytest.raises(ValueError): - embedder.run(text="The food was delicious") - - # embedding ndim == 2 but shape[0] != 1 - with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch: - mock_embedding_patch.return_value = array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]) - - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"} - ) - - with pytest.raises(ValueError): - embedder.run(text="The food was delicious") - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.flaky(reruns=3, reruns_delay=10) - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - @pytest.mark.skipif(sys.platform != "linux", reason="We only test on Linux to avoid overloading the HF server") - @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s") - def test_live_run_serverless(self): - embedder = HuggingFaceAPITextEmbedder( - api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "sentence-transformers/all-MiniLM-L6-v2"}, - ) - embedder._client.timeout = 10 # we want to fail fast if the server is not responding - result = embedder.run(text="The food was delicious") - - assert len(result["embedding"]) == 384 - assert all(isinstance(x, float) for x in result["embedding"]) - - @pytest.mark.integration - @pytest.mark.asyncio - @pytest.mark.slow - @pytest.mark.flaky(reruns=3, reruns_delay=10) - @pytest.mark.skipif(os.environ.get("HF_API_TOKEN", "") == "", reason="HF_API_TOKEN is not set") - @pytest.mark.skipif(sys.platform != "linux", reason="We only test on Linux to avoid overloading the HF server") - @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s") - async def test_live_run_async_serverless(self): - model_name = "sentence-transformers/all-MiniLM-L6-v2" - - embedder = HuggingFaceAPITextEmbedder( - 
api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model_name} - ) - embedder._client.timeout = 10 # we want to fail fast if the server is not responding - - text = "This is a test sentence for embedding." - result = await embedder.run_async(text=text) - - assert "embedding" in result - assert isinstance(result["embedding"], list) - assert all(isinstance(x, float) for x in result["embedding"]) - assert len(result["embedding"]) == 384 # MiniLM-L6-v2 has 384 dimensions diff --git a/test/components/generators/chat/test_hugging_face_api.py b/test/components/generators/chat/test_hugging_face_api.py deleted file mode 100644 index 50a0fcb368..0000000000 --- a/test/components/generators/chat/test_hugging_face_api.py +++ /dev/null @@ -1,1756 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -import os -from datetime import datetime -from typing import Any -from unittest.mock import AsyncMock, MagicMock, Mock, patch - -import pytest -from huggingface_hub import ( - ChatCompletionInputStreamOptions, - ChatCompletionOutput, - ChatCompletionOutputComplete, - ChatCompletionOutputFunctionDefinition, - ChatCompletionOutputMessage, - ChatCompletionOutputToolCall, - ChatCompletionOutputUsage, - ChatCompletionStreamOutput, - ChatCompletionStreamOutputChoice, - ChatCompletionStreamOutputDelta, - ChatCompletionStreamOutputUsage, -) -from huggingface_hub.errors import RepositoryNotFoundError - -from haystack import Pipeline -from haystack.components.generators.chat.hugging_face_api import ( - HuggingFaceAPIChatGenerator, - _convert_chat_completion_stream_output_to_streaming_chunk, - _convert_hfapi_tool_calls, - _convert_tools_to_hfapi_tools, - _resolve_schema_refs, -) -from haystack.dataclasses import ChatMessage, ImageContent, ReasoningContent, StreamingChunk, ToolCall -from haystack.tools import Tool -from haystack.tools.toolset import Toolset -from haystack.utils.auth import Secret -from haystack.utils.hf 
import HFGenerationAPIType - - -@pytest.fixture -def chat_messages(): - return [ - ChatMessage.from_system("You are a helpful assistant speaking A2 level of English"), - ChatMessage.from_user("Tell me about Berlin"), - ] - - -def get_weather(city: str) -> dict[str, Any]: - weather_info = { - "Berlin": {"weather": "mostly sunny", "temperature": 7, "unit": "celsius"}, - "Paris": {"weather": "mostly cloudy", "temperature": 8, "unit": "celsius"}, - "Rome": {"weather": "sunny", "temperature": 14, "unit": "celsius"}, - } - return weather_info.get(city, {"weather": "unknown", "temperature": 0, "unit": "celsius"}) - - -@pytest.fixture -def tools(): - weather_tool = Tool( - name="weather", - description="useful to determine the weather in a given location", - parameters={"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]}, - function=get_weather, - ) - return [weather_tool] - - -@pytest.fixture -def mock_check_valid_model(): - with patch( - "haystack.components.generators.chat.hugging_face_api.check_valid_model", MagicMock(return_value=None) - ) as mock: - yield mock - - -@pytest.fixture -def mock_chat_completion(): - # https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.example - - with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion: - completion = ChatCompletionOutput( - choices=[ - ChatCompletionOutputComplete( - finish_reason="eos_token", - index=0, - message=ChatCompletionOutputMessage(content="The capital of France is Paris.", role="assistant"), - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - usage=ChatCompletionOutputUsage(completion_tokens=8, prompt_tokens=17, total_tokens=25), - created=1710498360, - ) - - mock_chat_completion.return_value = completion - yield mock_chat_completion - - -@pytest.fixture -def mock_chat_completion_async(): - with 
patch("huggingface_hub.AsyncInferenceClient.chat_completion", autospec=True) as mock_chat_completion: - completion = ChatCompletionOutput( - choices=[ - ChatCompletionOutputComplete( - finish_reason="eos_token", - index=0, - message=ChatCompletionOutputMessage(content="The capital of France is Paris.", role="assistant"), - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - usage=ChatCompletionOutputUsage(completion_tokens=8, prompt_tokens=17, total_tokens=25), - created=1710498360, - ) - - # Use AsyncMock to properly mock the async method - mock_chat_completion.return_value = completion - mock_chat_completion.__call__ = AsyncMock(return_value=completion) - - yield mock_chat_completion - - -# used to test serialization of streaming_callback -def streaming_callback_handler(x): - return x - - -class TestHuggingFaceAPIChatGenerator: - def test_init_invalid_api_type(self): - with pytest.raises(ValueError): - HuggingFaceAPIChatGenerator(api_type="invalid_api_type", api_params={}) - - def test_init_serverless(self, mock_check_valid_model): - model = "HuggingFaceH4/zephyr-7b-alpha" - generation_kwargs = {"temperature": 0.6} - stop_words = ["stop"] - streaming_callback = None - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": model}, - token=None, - generation_kwargs=generation_kwargs, - stop_words=stop_words, - streaming_callback=streaming_callback, - ) - - assert generator.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API - assert generator.api_params == {"model": model} - assert generator.generation_kwargs == {**generation_kwargs, "stop": ["stop"], "max_tokens": 512} - assert generator.streaming_callback == streaming_callback - assert generator.tools is None - - # check that client and async_client are initialized - assert generator._client.model == model - assert generator._async_client.model == model - - def test_init_serverless_with_tools(self, 
mock_check_valid_model, tools): - model = "HuggingFaceH4/zephyr-7b-alpha" - generation_kwargs = {"temperature": 0.6} - stop_words = ["stop"] - streaming_callback = None - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": model}, - token=None, - generation_kwargs=generation_kwargs, - stop_words=stop_words, - streaming_callback=streaming_callback, - tools=tools, - ) - - assert generator.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API - assert generator.api_params == {"model": model} - assert generator.generation_kwargs == {**generation_kwargs, "stop": ["stop"], "max_tokens": 512} - assert generator.streaming_callback == streaming_callback - assert generator.tools == tools - - assert generator._client.model == model - assert generator._async_client.model == model - - def test_init_serverless_invalid_model(self, mock_check_valid_model): - mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id", response=MagicMock()) - with pytest.raises(RepositoryNotFoundError): - HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "invalid_model_id"} - ) - - def test_init_serverless_no_model(self): - with pytest.raises(ValueError): - HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"param": "irrelevant"} - ) - - def test_init_tgi(self): - url = "https://some_model.com" - generation_kwargs = {"temperature": 0.6} - stop_words = ["stop"] - streaming_callback = None - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, - api_params={"url": url}, - token=None, - generation_kwargs=generation_kwargs, - stop_words=stop_words, - streaming_callback=streaming_callback, - ) - - assert generator.api_type == HFGenerationAPIType.TEXT_GENERATION_INFERENCE - assert generator.api_params == {"url": url} - assert generator.generation_kwargs == 
{**generation_kwargs, "stop": ["stop"], "max_tokens": 512} - assert generator.streaming_callback == streaming_callback - assert generator.tools is None - - assert generator._client.model == url - assert generator._async_client.model == url - - def test_init_tgi_invalid_url(self): - with pytest.raises(ValueError): - HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, api_params={"url": "invalid_url"} - ) - - def test_init_tgi_no_url(self): - with pytest.raises(ValueError): - HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, api_params={"param": "irrelevant"} - ) - - def test_init_fail_with_duplicate_tool_names(self, mock_check_valid_model, tools): - duplicate_tools = [tools[0], tools[0]] - with pytest.raises(ValueError): - HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "irrelevant"}, - tools=duplicate_tools, - ) - - def test_init_fail_with_tools_and_streaming(self, mock_check_valid_model, tools): - with pytest.raises(ValueError): - HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "irrelevant"}, - tools=tools, - streaming_callback=streaming_callback_handler, - ) - - def test_to_dict(self, mock_check_valid_model): - tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print) - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, - generation_kwargs={"temperature": 0.6}, - stop_words=["stop", "words"], - tools=[tool], - ) - - result = generator.to_dict() - init_params = result["init_parameters"] - - assert init_params["api_type"] == "serverless_inference_api" - assert init_params["api_params"] == {"model": "HuggingFaceH4/zephyr-7b-beta"} - assert init_params["token"] == {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, 
"type": "env_var"} - assert init_params["generation_kwargs"] == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512} - assert init_params["streaming_callback"] is None - assert init_params["tools"] == [ - { - "type": "haystack.tools.tool.Tool", - "data": { - "async_function": None, - "description": "description", - "function": "builtins.print", - "inputs_from_state": None, - "name": "name", - "outputs_to_state": None, - "outputs_to_string": None, - "parameters": {"x": {"type": "string"}}, - }, - } - ] - - def test_from_dict(self, mock_check_valid_model): - tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print) - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, - token=Secret.from_env_var("ENV_VAR", strict=False), - generation_kwargs={"temperature": 0.6}, - stop_words=["stop", "words"], - tools=[tool], - ) - result = generator.to_dict() - - # now deserialize, call from_dict - generator_2 = HuggingFaceAPIChatGenerator.from_dict(result) - assert generator_2.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API - assert generator_2.api_params == {"model": "HuggingFaceH4/zephyr-7b-beta"} - assert generator_2.token == Secret.from_env_var("ENV_VAR", strict=False) - assert generator_2.generation_kwargs == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512} - assert generator_2.streaming_callback is None - assert generator_2.tools == [tool] - - def test_serde_in_pipeline(self, mock_check_valid_model): - tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print) - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "HuggingFaceH4/zephyr-7b-beta"}, - token=Secret.from_env_var("ENV_VAR", strict=False), - generation_kwargs={"temperature": 0.6}, - stop_words=["stop", "words"], 
- tools=[tool], - ) - - pipeline = Pipeline() - pipeline.add_component("generator", generator) - - pipeline_dict = pipeline.to_dict() - assert pipeline_dict == { - "metadata": {}, - "max_runs_per_component": 100, - "connection_type_validation": True, - "components": { - "generator": { - "type": "haystack.components.generators.chat.hugging_face_api.HuggingFaceAPIChatGenerator", - "init_parameters": { - "api_type": "serverless_inference_api", - "api_params": {"model": "HuggingFaceH4/zephyr-7b-beta"}, - "token": {"type": "env_var", "env_vars": ["ENV_VAR"], "strict": False}, - "generation_kwargs": {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512}, - "streaming_callback": None, - "tools": [ - { - "type": "haystack.tools.tool.Tool", - "data": { - "inputs_from_state": None, - "name": "name", - "outputs_to_state": None, - "outputs_to_string": None, - "description": "description", - "parameters": {"x": {"type": "string"}}, - "function": "builtins.print", - "async_function": None, - }, - } - ], - }, - } - }, - "connections": [], - } - - pipeline_yaml = pipeline.dumps() - - new_pipeline = Pipeline.loads(pipeline_yaml) - assert new_pipeline == pipeline - - def test_run(self, mock_check_valid_model, mock_chat_completion, chat_messages): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - generation_kwargs={"temperature": 0.6}, - stop_words=["stop", "words"], - streaming_callback=None, - ) - - response = generator.run(messages=chat_messages) - - # check kwargs passed to chat_completion - _, kwargs = mock_chat_completion.call_args - hf_messages = [ - {"role": "system", "content": "You are a helpful assistant speaking A2 level of English"}, - {"role": "user", "content": "Tell me about Berlin"}, - ] - assert kwargs == { - "temperature": 0.6, - "stop": ["stop", "words"], - "max_tokens": 512, - "tools": None, - "messages": hf_messages, - } - - assert 
isinstance(response, dict) - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) == 1 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - - def test_run_with_string_input(self, mock_check_valid_model, mock_chat_completion): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - ) - response = generator.run("What's the capital of France?") - - _, kwargs = mock_chat_completion.call_args - assert kwargs["messages"] == [{"role": "user", "content": "What's the capital of France?"}] - - assert isinstance(response["replies"], list) - assert len(response["replies"]) == 1 - assert isinstance(response["replies"][0], ChatMessage) - - def test_run_with_streaming_callback(self, mock_check_valid_model, mock_chat_completion, chat_messages): - streaming_call_count = 0 - - # Define the streaming callback function - def streaming_callback_fn(chunk: StreamingChunk): - nonlocal streaming_call_count - streaming_call_count += 1 - assert isinstance(chunk, StreamingChunk) - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - streaming_callback=streaming_callback_fn, - ) - - # Create a fake streamed response - # self needed here, don't remove - def mock_iter(self): - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length" - ) - ], - id="some_id", - model="some_model", - 
system_fingerprint="some_fingerprint", - created=1710498504, - ) - - mock_response = Mock(__iter__=mock_iter) - mock_chat_completion.return_value = mock_response - - # Generate text response with streaming callback - response = generator.run(chat_messages) - - # check kwargs passed to text_generation - _, kwargs = mock_chat_completion.call_args - assert kwargs == { - "stop": [], - "stream": True, - "max_tokens": 512, - "stream_options": ChatCompletionInputStreamOptions(include_usage=True), - } - - # Assert that the streaming callback was called twice - assert streaming_call_count == 2 - - # Assert that the response contains the generated replies - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) > 0 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - - def test_run_with_streaming_callback_in_run_method( - self, mock_check_valid_model, mock_chat_completion, chat_messages - ): - streaming_call_count = 0 - - # Define the streaming callback function - def streaming_callback_fn(chunk: StreamingChunk): - nonlocal streaming_call_count - streaming_call_count += 1 - assert isinstance(chunk, StreamingChunk) - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - ) - - # Create a fake streamed response - # self needed here, don't remove - def mock_iter(self): - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length" - ) - ], - id="some_id", - model="some_model", - 
system_fingerprint="some_fingerprint", - created=1710498504, - ) - - mock_response = Mock(__iter__=mock_iter) - mock_chat_completion.return_value = mock_response - - # Generate text response with streaming callback - response = generator.run(chat_messages, streaming_callback=streaming_callback_fn) - - # check kwargs passed to text_generation - _, kwargs = mock_chat_completion.call_args - assert kwargs == { - "stop": [], - "stream": True, - "max_tokens": 512, - "stream_options": ChatCompletionInputStreamOptions(include_usage=True), - } - - # Assert that the streaming callback was called twice - assert streaming_call_count == 2 - - # Assert that the response contains the generated replies - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) > 0 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - - def test_run_fail_with_tools_and_streaming(self, tools, mock_check_valid_model): - component = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - streaming_callback=streaming_callback_handler, - ) - - with pytest.raises(ValueError): - message = ChatMessage.from_user("irrelevant") - component.run([message], tools=tools) - - def test_run_with_tools(self, mock_check_valid_model, tools): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-3.1-70B-Instruct"}, - tools=tools, - ) - - with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion: - completion = ChatCompletionOutput( - choices=[ - ChatCompletionOutputComplete( - finish_reason="stop", - index=0, - message=ChatCompletionOutputMessage( - role="assistant", - content=None, - tool_calls=[ - ChatCompletionOutputToolCall( - function=ChatCompletionOutputFunctionDefinition( - arguments={"city": "Paris"}, name="weather", 
description=None - ), - id="0", - type="function", - ) - ], - ), - logprobs=None, - ) - ], - created=1729074760, - id="", - model="meta-llama/Llama-3.1-70B-Instruct", - system_fingerprint="2.3.2-dev0-sha-28bb7ae", - usage=ChatCompletionOutputUsage(completion_tokens=30, prompt_tokens=426, total_tokens=456), - ) - mock_chat_completion.return_value = completion - - messages = [ChatMessage.from_user("What is the weather in Paris?")] - response = generator.run(messages=messages) - - assert isinstance(response, dict) - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) == 1 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - assert response["replies"][0].tool_calls[0].tool_name == "weather" - assert response["replies"][0].tool_calls[0].arguments == {"city": "Paris"} - assert response["replies"][0].tool_calls[0].id == "0" - assert response["replies"][0].meta == { - "finish_reason": "tool_calls", - "index": 0, - "model": "meta-llama/Llama-3.1-70B-Instruct", - "usage": {"completion_tokens": 30, "prompt_tokens": 426}, - } - - def test_convert_hfapi_tool_calls_empty(self): - hfapi_tool_calls = None - tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) - assert len(tool_calls) == 0 - - hfapi_tool_calls = [] - tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) - assert len(tool_calls) == 0 - - def test_convert_hfapi_tool_calls_dict_arguments(self): - hfapi_tool_calls = [ - ChatCompletionOutputToolCall( - function=ChatCompletionOutputFunctionDefinition( - arguments={"city": "Paris"}, name="weather", description=None - ), - id="0", - type="function", - ) - ] - tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) - assert len(tool_calls) == 1 - assert tool_calls[0].tool_name == "weather" - assert tool_calls[0].arguments == {"city": "Paris"} - assert tool_calls[0].id == "0" - - def test_convert_hfapi_tool_calls_str_arguments(self): - hfapi_tool_calls = [ - 
ChatCompletionOutputToolCall( - function=ChatCompletionOutputFunctionDefinition( - arguments='{"city": "Paris"}', name="weather", description=None - ), - id="0", - type="function", - ) - ] - tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) - assert len(tool_calls) == 1 - assert tool_calls[0].tool_name == "weather" - assert tool_calls[0].arguments == {"city": "Paris"} - assert tool_calls[0].id == "0" - - def test_convert_hfapi_tool_calls_invalid_str_arguments(self): - hfapi_tool_calls = [ - ChatCompletionOutputToolCall( - function=ChatCompletionOutputFunctionDefinition( - arguments="not a valid JSON string", name="weather", description=None - ), - id="0", - type="function", - ) - ] - tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) - assert len(tool_calls) == 0 - - def test_convert_hfapi_tool_calls_invalid_type_arguments(self): - hfapi_tool_calls = [ - ChatCompletionOutputToolCall( - function=ChatCompletionOutputFunctionDefinition( - arguments=["this", "is", "a", "list"], name="weather", description=None - ), - id="0", - type="function", - ) - ] - tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls) - assert len(tool_calls) == 0 - - @pytest.mark.parametrize( - "hf_stream_output, expected_stream_chunk, dummy_previous_chunks", - [ - ( - ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(role="assistant", content=" Paris"), index=0 - ) - ], - created=1748339326, - id="", - model="microsoft/Phi-3.5-mini-instruct", - system_fingerprint="3.2.1-sha-4d28897", - ), - StreamingChunk( - content=" Paris", - meta={ - "received_at": "2025-05-27T12:14:28.228852", - "model": "microsoft/Phi-3.5-mini-instruct", - "finish_reason": None, - }, - index=0, - start=True, - ), - [], - ), - ( - ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(role="assistant", content=""), - index=0, - finish_reason="stop", - ) - ], - created=1748339326, - 
id="", - model="microsoft/Phi-3.5-mini-instruct", - system_fingerprint="3.2.1-sha-4d28897", - ), - StreamingChunk( - content="", - meta={ - "received_at": "2025-05-27T12:14:28.228852", - "model": "microsoft/Phi-3.5-mini-instruct", - "finish_reason": "stop", - }, - finish_reason="stop", - ), - [0], - ), - ( - ChatCompletionStreamOutput( - choices=[], - created=1748339326, - id="", - model="microsoft/Phi-3.5-mini-instruct", - system_fingerprint="3.2.1-sha-4d28897", - usage=ChatCompletionStreamOutputUsage(completion_tokens=2, prompt_tokens=21, total_tokens=23), - ), - StreamingChunk( - content="", - meta={ - "received_at": "2025-05-27T12:14:28.228852", - "model": "microsoft/Phi-3.5-mini-instruct", - "usage": {"completion_tokens": 2, "prompt_tokens": 21}, - }, - ), - [0, 1], - ), - ], - ) - def test_convert_chat_completion_stream_output_to_streaming_chunk( - self, hf_stream_output, expected_stream_chunk, dummy_previous_chunks - ): - converted_stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk( - chunk=hf_stream_output, previous_chunks=dummy_previous_chunks - ) - # Remove timestamp from comparison since it's always the current time - converted_stream_chunk.meta.pop("received_at", None) - expected_stream_chunk.meta.pop("received_at", None) - assert converted_stream_chunk == expected_stream_chunk - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - @pytest.mark.flaky(reruns=2, reruns_delay=10) - def test_live_run_serverless(self): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "Qwen/Qwen2.5-7B-Instruct", "provider": "together"}, - generation_kwargs={"max_tokens": 20}, - ) - - # No need for instruction tokens here since we use the chat_completion endpoint which handles the chat - # templating for us. 
- messages = [ - ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.") - ] - response = generator.run(messages=messages) - - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) > 0 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - assert response["replies"][0].text is not None - meta = response["replies"][0].meta - assert "usage" in meta - assert "prompt_tokens" in meta["usage"] - assert meta["usage"]["prompt_tokens"] > 0 - assert "completion_tokens" in meta["usage"] - assert meta["usage"]["completion_tokens"] > 0 - assert meta["model"] == "Qwen/Qwen2.5-7B-Instruct" - assert meta["finish_reason"] is not None - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - @pytest.mark.flaky(reruns=2, reruns_delay=10) - def test_live_run_serverless_streaming(self): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "Qwen/Qwen2.5-7B-Instruct", "provider": "together"}, - generation_kwargs={"max_tokens": 20}, - streaming_callback=streaming_callback_handler, - ) - - # No need for instruction tokens here since we use the chat_completion endpoint which handles the chat - # templating for us. - messages = [ - ChatMessage.from_user("What is the capital of France? 
Be concise only provide the capital, nothing else.") - ] - response = generator.run(messages=messages) - - print(response) - - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) > 0 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - assert response["replies"][0].text is not None - - response_meta = response["replies"][0].meta - assert "completion_start_time" in response_meta - assert datetime.fromisoformat(response_meta["completion_start_time"]) <= datetime.now() - assert "usage" in response_meta - assert "prompt_tokens" in response_meta["usage"] - assert response_meta["usage"]["prompt_tokens"] >= 0 - assert "completion_tokens" in response_meta["usage"] - assert response_meta["usage"]["completion_tokens"] >= 0 - # internally, Together calls this "Qwen/Qwen2.5-7B-Instruct-Turbo" - assert "Qwen/Qwen2.5-7B-Instruct" in response_meta["model"] - assert response_meta["finish_reason"] is not None - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - def test_live_run_with_tools(self, tools): - """ - We test the round trip: generate tool call, pass tool message, generate response. - - The model used here is not gated and kept in a warm state. 
- """ - - chat_messages = [ChatMessage.from_user("What's the weather like in Paris?")] - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "Qwen/Qwen3.5-9B", "provider": "together"}, - generation_kwargs={"temperature": 0.5, "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}}, - ) - - results = generator.run(chat_messages, tools=tools) - assert len(results["replies"]) == 1 - message = results["replies"][0] - - assert message.tool_calls - tool_call = message.tool_call - assert isinstance(tool_call, ToolCall) - assert tool_call.tool_name == "weather" - assert "city" in tool_call.arguments - assert "Paris" in tool_call.arguments["city"] - assert message.meta["finish_reason"] == "tool_calls" - - new_messages = chat_messages + [message, ChatMessage.from_tool(tool_result="22° C", origin=tool_call)] - - # the model tends to make tool calls if provided with tools, so we don't pass them here - results = generator.run(new_messages, generation_kwargs={"max_tokens": 50}) - - assert len(results["replies"]) == 1 - final_message = results["replies"][0] - assert not final_message.tool_calls - assert len(final_message.text) > 0 - assert "paris" in final_message.text.lower() and "22" in final_message.text - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - def test_live_run_multimodal(self, test_files_path): - image_path = test_files_path / "images" / "apple.jpg" - # Resize the image to keep this test fast - image_content = ImageContent.from_file_path(file_path=image_path, size=(100, 100)) - messages = [ChatMessage.from_user(content_parts=["What does this image show? 
Max 5 words", image_content])] - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "Qwen/Qwen3.5-9B", "provider": "together"}, - generation_kwargs={"max_tokens": 20, "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}}, - ) - - response = generator.run(messages=messages) - - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) > 0 - message = response["replies"][0] - assert message.text - assert len(message.text) > 0 - assert any(word in message.text.lower() for word in ["apple", "fruit", "red"]) - - @pytest.mark.asyncio - async def test_run_async(self, mock_check_valid_model, mock_chat_completion_async, chat_messages): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - generation_kwargs={"temperature": 0.6}, - stop_words=["stop", "words"], - streaming_callback=None, - ) - - response = await generator.run_async(messages=chat_messages) - - # check kwargs passed to chat_completion - _, kwargs = mock_chat_completion_async.call_args - hf_messages = [ - {"role": "system", "content": "You are a helpful assistant speaking A2 level of English"}, - {"role": "user", "content": "Tell me about Berlin"}, - ] - assert kwargs == { - "temperature": 0.6, - "stop": ["stop", "words"], - "max_tokens": 512, - "tools": None, - "messages": hf_messages, - } - - assert isinstance(response, dict) - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) == 1 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - - async def test_run_async_with_string_input(self, mock_check_valid_model, mock_chat_completion_async): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": 
"meta-llama/Llama-2-13b-chat-hf"}, - ) - response = await generator.run_async("What's the capital of France?") - - _, kwargs = mock_chat_completion_async.call_args - assert kwargs["messages"] == [{"role": "user", "content": "What's the capital of France?"}] - - assert isinstance(response["replies"], list) - assert len(response["replies"]) == 1 - assert isinstance(response["replies"][0], ChatMessage) - - @pytest.mark.asyncio - async def test_run_async_with_streaming(self, mock_check_valid_model, mock_chat_completion_async, chat_messages): - streaming_call_count = 0 - - async def streaming_callback_fn(chunk: StreamingChunk): - nonlocal streaming_call_count - streaming_call_count += 1 - assert isinstance(chunk, StreamingChunk) - - # Create a fake streamed response - async def mock_aiter(self): - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length" - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - mock_response = Mock(__aiter__=mock_aiter) - mock_chat_completion_async.return_value = mock_response - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - streaming_callback=streaming_callback_fn, - ) - - response = await generator.run_async(messages=chat_messages) - - # check kwargs passed to chat_completion - _, kwargs = mock_chat_completion_async.call_args - assert kwargs == { - "stop": [], - "stream": True, - "max_tokens": 512, - "stream_options": 
ChatCompletionInputStreamOptions(include_usage=True), - } - - # Assert that the streaming callback was called twice - assert streaming_call_count == 2 - - # Assert that the response contains the generated replies - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) > 0 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - - @pytest.mark.asyncio - async def test_run_async_with_tools(self, tools, mock_check_valid_model): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-3.1-70B-Instruct"}, - tools=tools, - ) - - with patch("huggingface_hub.AsyncInferenceClient.chat_completion", autospec=True) as mock_chat_completion_async: - completion = ChatCompletionOutput( - choices=[ - ChatCompletionOutputComplete( - finish_reason="stop", - index=0, - message=ChatCompletionOutputMessage( - role="assistant", - content=None, - tool_calls=[ - ChatCompletionOutputToolCall( - function=ChatCompletionOutputFunctionDefinition( - arguments={"city": "Paris"}, name="weather", description=None - ), - id="0", - type="function", - ) - ], - ), - logprobs=None, - ) - ], - created=1729074760, - id="", - model="meta-llama/Llama-3.1-70B-Instruct", - system_fingerprint="2.3.2-dev0-sha-28bb7ae", - usage=ChatCompletionOutputUsage(completion_tokens=30, prompt_tokens=426, total_tokens=456), - ) - mock_chat_completion_async.return_value = completion - - messages = [ChatMessage.from_user("What is the weather in Paris?")] - response = await generator.run_async(messages=messages) - - assert isinstance(response, dict) - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) == 1 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - assert response["replies"][0].tool_calls[0].tool_name == "weather" - assert response["replies"][0].tool_calls[0].arguments == 
{"city": "Paris"} - assert response["replies"][0].tool_calls[0].id == "0" - assert response["replies"][0].meta == { - "finish_reason": "tool_calls", - "index": 0, - "model": "meta-llama/Llama-3.1-70B-Instruct", - "usage": {"completion_tokens": 30, "prompt_tokens": 426}, - } - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - @pytest.mark.flaky(reruns=2, reruns_delay=10) - @pytest.mark.asyncio - async def test_live_run_async_serverless(self): - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "Qwen/Qwen2.5-7B-Instruct", "provider": "together"}, - generation_kwargs={"max_tokens": 20}, - ) - - messages = [ - ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.") - ] - try: - response = await generator.run_async(messages=messages) - - assert "replies" in response - assert isinstance(response["replies"], list) - assert len(response["replies"]) > 0 - assert [isinstance(reply, ChatMessage) for reply in response["replies"]] - assert response["replies"][0].text is not None - - meta = response["replies"][0].meta - assert "usage" in meta - assert "prompt_tokens" in meta["usage"] - assert meta["usage"]["prompt_tokens"] > 0 - assert "completion_tokens" in meta["usage"] - assert meta["usage"]["completion_tokens"] > 0 - assert meta["model"] == "Qwen/Qwen2.5-7B-Instruct" - assert meta["finish_reason"] is not None - finally: - await generator._async_client.close() - - @pytest.mark.integration - @pytest.mark.slow - @pytest.mark.skipif( - not os.environ.get("HF_API_TOKEN", None), - reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.", - ) - @pytest.mark.flaky(reruns=2, reruns_delay=10) - def test_live_run_multi_turn_with_reasoning_model(self): - 
""" - Test multi-turn conversation with a reasoning model. - - This test verifies that: - 1. Reasoning content is captured from the model's response - 2. When the assistant message (with reasoning) is sent back in a multi-turn conversation, - the API call succeeds (reasoning is dropped during conversion since HF API doesn't support it) - """ - # Note: Using a model that supports reasoning AND a provider that actually follows the spec defined in - # huggingface-hub. Reasoning content especially seems to be non-standard across providers and is either left - # in the main response or put in a new field that is not part of the official API. - # One combo that does respect the spec is together + openai/gpt-oss-20b. - # together + openai/gpt-oss-20b actually uses the expected reasoning field in the response - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - # We use together + openai/gpt-oss-20b since it actually returns reasoning content in the expected field - api_params={"model": "openai/gpt-oss-20b", "provider": "together"}, - generation_kwargs={"max_tokens": 300}, - ) - - # First turn: ask a question - messages = [ChatMessage.from_user("What is 2 + 2? Answer briefly.")] - response = generator.run(messages=messages) - - assert "replies" in response - assert len(response["replies"]) > 0 - first_reply = response["replies"][0] - assert first_reply.text is not None - assert first_reply.reasoning is not None - - # Second turn: send a follow-up including the assistant's previous response - # This tests that convert_message_to_hf_format properly handles messages - # that may contain ReasoningContent (it should skip it) - follow_up_messages = [ - ChatMessage.from_user("What is 2 + 2? Answer briefly."), - first_reply, # Include the assistant's response with reasoning - ChatMessage.from_user("Now what is 3 + 3? 
Answer briefly."), - ] - follow_up_response = generator.run(messages=follow_up_messages) - - # Verify the second turn succeeds - assert "replies" in follow_up_response - assert len(follow_up_response["replies"]) > 0 - assert follow_up_response["replies"][0].text is not None - assert follow_up_response["replies"][0].reasoning is not None - - def test_hugging_face_api_generator_with_toolset_initialization(self, mock_check_valid_model, tools): - """Test that the HuggingFaceAPIChatGenerator can be initialized with a Toolset.""" - toolset = Toolset(tools) - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset - ) - assert generator.tools == toolset - - def test_from_dict_with_toolset(self, mock_check_valid_model, tools): - """Test that the HuggingFaceAPIChatGenerator can be deserialized from a dictionary with a Toolset.""" - toolset = Toolset(tools) - component = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset - ) - data = component.to_dict() - - deserialized_component = HuggingFaceAPIChatGenerator.from_dict(data) - - assert isinstance(deserialized_component.tools, Toolset) - assert len(deserialized_component.tools) == len(tools) - assert all(isinstance(tool, Tool) for tool in deserialized_component.tools) - - def test_to_dict_with_toolset(self, mock_check_valid_model, tools): - """Test that the HuggingFaceAPIChatGenerator can be serialized to a dictionary with a Toolset.""" - toolset = Toolset(tools[:1]) - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset - ) - data = generator.to_dict() - - expected_tools_data = { - "type": "haystack.tools.toolset.Toolset", - "data": { - "tools": [ - { - "type": "haystack.tools.tool.Tool", - "data": { - "name": "weather", - "description": "useful to determine 
the weather in a given location", - "parameters": { - "type": "object", - "properties": {"city": {"type": "string"}}, - "required": ["city"], - }, - "function": "generators.chat.test_hugging_face_api.get_weather", - "async_function": None, - "outputs_to_string": None, - "inputs_from_state": None, - "outputs_to_state": None, - }, - } - ] - }, - } - assert data["init_parameters"]["tools"] == expected_tools_data - - def test_convert_tools_to_hfapi_tools(self): - assert _convert_tools_to_hfapi_tools(None) is None - assert _convert_tools_to_hfapi_tools([]) is None - - tool = Tool( - name="weather", - description="useful to determine the weather in a given location", - parameters={"city": {"type": "string"}}, - function=get_weather, - ) - hf_tools = _convert_tools_to_hfapi_tools([tool]) - assert len(hf_tools) == 1 - assert hf_tools[0].type == "function" - assert hf_tools[0].function.name == "weather" - assert hf_tools[0].function.description == "useful to determine the weather in a given location" - assert hf_tools[0].function.parameters == {"city": {"type": "string"}} - - def test_convert_tools_to_hfapi_tools_legacy(self): - # this satisfies the check hasattr(ChatCompletionInputFunctionDefinition, "arguments") - mock_class = MagicMock() - - with patch( - "haystack.components.generators.chat.hugging_face_api.ChatCompletionInputFunctionDefinition", mock_class - ): - tool = Tool( - name="weather", - description="useful to determine the weather in a given location", - parameters={"city": {"type": "string"}}, - function=get_weather, - ) - _convert_tools_to_hfapi_tools([tool]) - - mock_class.assert_called_once_with( - name="weather", - arguments={"city": {"type": "string"}}, - description="useful to determine the weather in a given location", - ) - - def test_warm_up_with_tools(self, mock_check_valid_model): - """Test that warm_up() calls warm_up on tools and is idempotent.""" - - # Create a mock tool that tracks if warm_up() was called - class MockTool(Tool): - 
warm_up_call_count = 0 # Class variable to track calls - - def __init__(self): - super().__init__( - name="mock_tool", - description="A mock tool for testing", - parameters={"x": {"type": "string"}}, - function=lambda x: x, - ) - - def warm_up(self): - MockTool.warm_up_call_count += 1 - - # Reset the class variable before test - MockTool.warm_up_call_count = 0 - mock_tool = MockTool() - - # Create HuggingFaceAPIChatGenerator with the mock tool - component = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "HuggingFaceH4/zephyr-7b-alpha"}, - tools=[mock_tool], - ) - - # Verify initial state - warm_up not called yet - assert MockTool.warm_up_call_count == 0 - assert not component._is_warmed_up - - # Call warm_up() on the generator - component.warm_up() - - # Assert that the tool's warm_up() was called - assert MockTool.warm_up_call_count == 1 - assert component._is_warmed_up - - # Call warm_up() again and verify it's idempotent (only warms up once) - component.warm_up() - - # The tool's warm_up should still only have been called once - assert MockTool.warm_up_call_count == 1 - assert component._is_warmed_up - - def test_warm_up_with_no_tools(self, mock_check_valid_model): - """Test that warm_up() works when no tools are provided.""" - component = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "HuggingFaceH4/zephyr-7b-alpha"} - ) - - # Verify initial state - assert not component._is_warmed_up - assert component.tools is None - - # Call warm_up() - should not raise an error - component.warm_up() - - # Verify the component is warmed up - assert component._is_warmed_up - - # Call warm_up() again - should be idempotent - component.warm_up() - assert component._is_warmed_up - - def test_warm_up_with_multiple_tools(self, mock_check_valid_model): - """Test that warm_up() works with multiple tools.""" - # Track warm_up calls - warm_up_calls = [] - - class 
MockTool(Tool): - def __init__(self, tool_name): - super().__init__( - name=tool_name, - description=f"Mock tool {tool_name}", - parameters={"type": "object", "properties": {"x": {"type": "string"}}, "required": ["x"]}, - function=lambda x: f"{tool_name} result: {x}", - ) - - def warm_up(self): - warm_up_calls.append(self.name) - - mock_tool1 = MockTool("tool1") - mock_tool2 = MockTool("tool2") - - # Use a LIST of tools, not a Toolset - component = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "HuggingFaceH4/zephyr-7b-alpha"}, - tools=[mock_tool1, mock_tool2], - ) - - # Call warm_up() - component.warm_up() - - # Assert that both tools' warm_up() were called - assert "tool1" in warm_up_calls - assert "tool2" in warm_up_calls - assert component._is_warmed_up - - # Track count - call_count = len(warm_up_calls) - - # Verify idempotency - component.warm_up() - assert len(warm_up_calls) == call_count - - def test_run_with_reasoning_non_streaming(self, mock_check_valid_model, chat_messages): - """Test that reasoning content is correctly extracted from non-streaming responses.""" - with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion: - reasoning_text = "Let me think about this. France is a country in Europe. Its capital city is Paris." 
- completion = ChatCompletionOutput( - choices=[ - ChatCompletionOutputComplete( - finish_reason="eos_token", - index=0, - message=ChatCompletionOutputMessage( - content="The capital of France is Paris.", role="assistant", reasoning=reasoning_text - ), - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - usage=ChatCompletionOutputUsage(completion_tokens=20, prompt_tokens=17, total_tokens=37), - created=1710498360, - ) - mock_chat_completion.return_value = completion - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - ) - - response = generator.run(chat_messages) - - assert "replies" in response - assert len(response["replies"]) == 1 - reply = response["replies"][0] - assert reply.text == "The capital of France is Paris." - assert reply.reasoning is not None - assert isinstance(reply.reasoning, ReasoningContent) - assert reply.reasoning.reasoning_text == reasoning_text - - def test_run_without_reasoning_non_streaming(self, mock_check_valid_model, mock_chat_completion, chat_messages): - """Test that responses without reasoning work correctly (backward compatibility).""" - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - ) - - response = generator.run(chat_messages) - - assert "replies" in response - assert len(response["replies"]) == 1 - reply = response["replies"][0] - assert reply.text == "The capital of France is Paris." 
- assert reply.reasoning is None - - def test_run_with_reasoning_streaming(self, mock_check_valid_model, chat_messages): - """Test that reasoning content is correctly extracted from streaming responses.""" - streaming_chunks_received = [] - - def streaming_callback_fn(chunk: StreamingChunk): - streaming_chunks_received.append(chunk) - - with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion: - # Create a fake streamed response with reasoning - def mock_iter(self): - # First chunk with reasoning - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta( - role="assistant", content=None, reasoning="Let me think..." - ), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - # Second chunk with more reasoning - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta( - role=None, content=None, reasoning=" The capital of France is Paris." 
- ), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - # Third chunk with actual content - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(role=None, content="Paris", reasoning=None), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - # Final chunk with finish reason - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(role=None, content=None, reasoning=None), - index=0, - finish_reason="stop", - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - mock_response = Mock(__iter__=mock_iter) - mock_chat_completion.return_value = mock_response - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - streaming_callback=streaming_callback_fn, - ) - - response = generator.run(chat_messages) - - # Check streaming chunks received with reasoning - assert len(streaming_chunks_received) == 4 - assert streaming_chunks_received[0].reasoning is not None - assert streaming_chunks_received[0].reasoning.reasoning_text == "Let me think..." - assert streaming_chunks_received[1].reasoning is not None - assert streaming_chunks_received[1].reasoning.reasoning_text == " The capital of France is Paris." - - # Check final message - assert "replies" in response - assert len(response["replies"]) == 1 - reply = response["replies"][0] - assert reply.text == "Paris" - assert reply.reasoning is not None - assert isinstance(reply.reasoning, ReasoningContent) - assert reply.reasoning.reasoning_text == "Let me think... The capital of France is Paris." 
- - @pytest.mark.asyncio - async def test_run_async_with_reasoning_non_streaming(self, mock_check_valid_model, chat_messages): - """Test that reasoning content is correctly extracted from async non-streaming responses.""" - with patch( - "huggingface_hub.AsyncInferenceClient.chat_completion", new_callable=AsyncMock - ) as mock_chat_completion: - completion = ChatCompletionOutput( - choices=[ - ChatCompletionOutputComplete( - finish_reason="eos_token", - index=0, - message=ChatCompletionOutputMessage( - content="The capital of France is Paris.", - role="assistant", - reasoning="Let me reason about this question step by step.", - ), - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - usage=ChatCompletionOutputUsage(completion_tokens=20, prompt_tokens=17, total_tokens=37), - created=1710498360, - ) - mock_chat_completion.return_value = completion - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - ) - - response = await generator.run_async(chat_messages) - - assert "replies" in response - assert len(response["replies"]) == 1 - reply = response["replies"][0] - assert reply.text == "The capital of France is Paris." - assert reply.reasoning is not None - assert isinstance(reply.reasoning, ReasoningContent) - assert reply.reasoning.reasoning_text == "Let me reason about this question step by step." 
- - @pytest.mark.asyncio - async def test_run_async_with_reasoning_streaming(self, mock_check_valid_model, chat_messages): - """Test that reasoning content is correctly extracted from async streaming responses.""" - streaming_chunks_received = [] - - async def streaming_callback_fn(chunk: StreamingChunk): - streaming_chunks_received.append(chunk) - - with patch( - "huggingface_hub.AsyncInferenceClient.chat_completion", new_callable=AsyncMock - ) as mock_chat_completion: - # Create async iterable for streaming - async def mock_aiter(): - # First chunk with reasoning - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta( - role="assistant", content=None, reasoning="Thinking..." - ), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - # Second chunk with content - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(role=None, content="Paris", reasoning=None), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - # Final chunk - yield ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(role=None, content=None, reasoning=None), - index=0, - finish_reason="stop", - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - mock_chat_completion.return_value = mock_aiter() - - generator = HuggingFaceAPIChatGenerator( - api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, - api_params={"model": "meta-llama/Llama-2-13b-chat-hf"}, - streaming_callback=streaming_callback_fn, - ) - - response = await generator.run_async(chat_messages) - - # Check streaming chunks - assert len(streaming_chunks_received) == 3 - assert 
streaming_chunks_received[0].reasoning is not None - assert streaming_chunks_received[0].reasoning.reasoning_text == "Thinking..." - - # Check final message - assert "replies" in response - assert len(response["replies"]) == 1 - reply = response["replies"][0] - assert reply.text == "Paris" - assert reply.reasoning is not None - assert isinstance(reply.reasoning, ReasoningContent) - assert reply.reasoning.reasoning_text == "Thinking..." - - def test_convert_chat_completion_stream_output_to_streaming_chunk_with_reasoning(self): - """Test that reasoning is correctly extracted from streaming chunks.""" - # In streaming mode, reasoning and content come in separate chunks - chunk = ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta( - role="assistant", content=None, reasoning="Let me think about this." - ), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(chunk=chunk, previous_chunks=[]) - - assert streaming_chunk.content == "" - assert streaming_chunk.reasoning is not None - assert isinstance(streaming_chunk.reasoning, ReasoningContent) - assert streaming_chunk.reasoning.reasoning_text == "Let me think about this." 
- - def test_convert_chat_completion_stream_output_to_streaming_chunk_without_reasoning(self): - """Test that chunks without reasoning still work correctly.""" - chunk = ChatCompletionStreamOutput( - choices=[ - ChatCompletionStreamOutputChoice( - delta=ChatCompletionStreamOutputDelta(role="assistant", content="Hello"), - index=0, - finish_reason=None, - ) - ], - id="some_id", - model="some_model", - system_fingerprint="some_fingerprint", - created=1710498504, - ) - - streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(chunk=chunk, previous_chunks=[]) - - assert streaming_chunk.content == "Hello" - assert streaming_chunk.reasoning is None - - def test_resolve_schema_refs_no_defs(self): - """Schema without $defs is returned as-is.""" - schema = {"type": "object", "properties": {"name": {"type": "string"}}} - assert _resolve_schema_refs(schema) == schema - - def test_resolve_schema_refs_expands_defs(self): - """Schema with $defs and $ref is expanded correctly.""" - schema = { - "$defs": { - "User": { - "type": "object", - "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}, - "required": ["name"], - } - }, - "type": "object", - "properties": {"user": {"$ref": "#/$defs/User"}}, - "required": ["user"], - } - resolved = _resolve_schema_refs(schema) - assert "$defs" not in resolved - assert "$ref" not in resolved["properties"]["user"] - assert resolved["properties"]["user"]["type"] == "object" - assert resolved["properties"]["user"]["properties"]["name"] == {"type": "string"} - - def test_resolve_schema_refs_nested_refs(self): - """Schema with nested $ref references is expanded correctly.""" - schema = { - "$defs": { - "Address": {"type": "object", "properties": {"street": {"type": "string"}}}, - "User": { - "type": "object", - "properties": {"name": {"type": "string"}, "address": {"$ref": "#/$defs/Address"}}, - }, - }, - "type": "object", - "properties": {"user": {"$ref": "#/$defs/User"}}, - } - resolved = 
_resolve_schema_refs(schema) - assert "$defs" not in resolved - user = resolved["properties"]["user"] - assert user["properties"]["address"]["type"] == "object" - assert user["properties"]["address"]["properties"]["street"] == {"type": "string"} - - def test_convert_tools_to_hfapi_tools_resolves_defs(self): - """Tool schemas with $defs are resolved before passing to HF API.""" - tool = Tool( - name="get_user", - description="Get user info", - parameters={ - "$defs": {"User": {"type": "object", "properties": {"name": {"type": "string"}}}}, - "type": "object", - "properties": {"user": {"$ref": "#/$defs/User"}}, - }, - function=lambda user: user, - ) - hf_tools = _convert_tools_to_hfapi_tools([tool]) - assert hf_tools is not None - assert len(hf_tools) == 1 - params = hf_tools[0].function.parameters or hf_tools[0].function.arguments - assert "$defs" not in params - assert params["properties"]["user"]["type"] == "object" diff --git a/test/components/preprocessors/test_embedding_based_document_splitter.py b/test/components/preprocessors/test_embedding_based_document_splitter.py index 4e5ccfa053..9bcb737197 100644 --- a/test/components/preprocessors/test_embedding_based_document_splitter.py +++ b/test/components/preprocessors/test_embedding_based_document_splitter.py @@ -9,7 +9,7 @@ import pytest from haystack import Document -from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder, SentenceTransformersDocumentEmbedder +from haystack.components.embedders import OpenAIDocumentEmbedder, SentenceTransformersDocumentEmbedder from haystack.components.preprocessors import EmbeddingBasedDocumentSplitter from haystack.utils import ComponentDevice @@ -403,16 +403,10 @@ def test_split_document_with_multiple_topics(self, del_hf_env_vars, monkeypatch) assert combined in original or original in combined @pytest.mark.asyncio - @pytest.mark.skipif( - not os.environ.get("TEI_URL", None), - reason="Export an env var called TEI_URL containing the TextEmbeddingInference 
url to run this test.", - ) - @pytest.mark.slow + @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set") @pytest.mark.integration async def test_split_document_with_multiple_topics_async(self) -> None: - embedder = HuggingFaceAPIDocumentEmbedder( - api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} - ) + embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") splitter = EmbeddingBasedDocumentSplitter( document_embedder=embedder, sentences_per_group=2, percentile=0.9, min_length=30, max_length=300 @@ -467,16 +461,10 @@ def test_trailing_whitespace_is_preserved(self, del_hf_env_vars): assert result["documents"][0].content == text @pytest.mark.asyncio - @pytest.mark.skipif( - not os.environ.get("TEI_URL", None), - reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", - ) - @pytest.mark.slow + @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set") @pytest.mark.integration async def test_trailing_whitespace_is_preserved_async(self) -> None: - embedder = HuggingFaceAPIDocumentEmbedder( - api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} - ) + embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") splitter = EmbeddingBasedDocumentSplitter(document_embedder=embedder, sentences_per_group=1) # Normal trailing whitespace @@ -524,16 +512,10 @@ def test_no_extra_whitespaces_between_sentences(self, del_hf_env_vars): ) # noqa: E501 @pytest.mark.asyncio - @pytest.mark.skipif( - not os.environ.get("TEI_URL", None), - reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", - ) + @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set") @pytest.mark.integration - @pytest.mark.slow async def test_no_extra_whitespaces_between_sentences_async(self) -> None: - embedder = 
HuggingFaceAPIDocumentEmbedder( - api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} - ) + embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") splitter = EmbeddingBasedDocumentSplitter( document_embedder=embedder, sentences_per_group=1, percentile=0.9, min_length=10, max_length=500 @@ -600,21 +582,15 @@ def test_split_large_splits_recursion(self, del_hf_env_vars): assert "page_number" in split_doc.meta @pytest.mark.asyncio - @pytest.mark.skipif( - not os.environ.get("TEI_URL", None), - reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", - ) + @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set") @pytest.mark.integration - @pytest.mark.slow async def test_split_large_splits_recursion_async(self) -> None: """ Test that _split_large_splits() works correctly without infinite loops. This test uses a longer text that will trigger the recursive splitting logic. If the chunk cannot be split further, it is allowed to be larger than max_length. 
""" - embedder = HuggingFaceAPIDocumentEmbedder( - api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} - ) + embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") semantic_chunker = EmbeddingBasedDocumentSplitter( document_embedder=embedder, sentences_per_group=5, percentile=0.95, min_length=50, max_length=1000 ) @@ -729,20 +705,14 @@ def test_split_large_splits_actually_splits(self, del_hf_env_vars): assert split_doc.meta["page_number"] == 4 @pytest.mark.asyncio - @pytest.mark.skipif( - not os.environ.get("TEI_URL", None), - reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.", - ) + @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set") @pytest.mark.integration - @pytest.mark.slow async def test_split_large_splits_actually_splits_async(self) -> None: """ Test that _split_large_splits() actually works and can split long texts into multiple chunks. This test uses a very long text that should be split into multiple chunks. 
""" - embedder = HuggingFaceAPIDocumentEmbedder( - api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")} - ) + embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small") semantic_chunker = EmbeddingBasedDocumentSplitter( document_embedder=embedder, sentences_per_group=3, diff --git a/test/components/rankers/test_hugging_face_tei.py b/test/components/rankers/test_hugging_face_tei.py deleted file mode 100644 index bcfbb06020..0000000000 --- a/test/components/rankers/test_hugging_face_tei.py +++ /dev/null @@ -1,351 +0,0 @@ -# SPDX-FileCopyrightText: 2022-present deepset GmbH -# -# SPDX-License-Identifier: Apache-2.0 - -from unittest.mock import MagicMock, patch - -import httpx -import pytest - -from haystack import Document -from haystack.components.rankers.hugging_face_tei import HuggingFaceTEIRanker, TruncationDirection -from haystack.utils import Secret - - -class TestHuggingFaceTEIRanker: - def test_init(self, del_hf_env_vars): - """Test initialization with default and custom parameters""" - # Default parameters - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com") - assert ranker.url == "https://api.my-tei-service.com" - assert ranker.top_k == 10 - assert ranker.timeout == 30 - assert not ranker.token.resolve_value() - assert ranker.max_retries == 3 - assert ranker.retry_status_codes is None - - # Custom parameters - token = Secret.from_token("my_api_token") - ranker = HuggingFaceTEIRanker( - url="https://api.my-tei-service.com", - top_k=5, - timeout=60, - token=token, - max_retries=5, - retry_status_codes=[500, 502, 503], - ) - assert ranker.url == "https://api.my-tei-service.com" - assert ranker.top_k == 5 - assert ranker.timeout == 60 - assert ranker.token == token - assert ranker.max_retries == 5 - assert ranker.retry_status_codes == [500, 502, 503] - - def test_to_dict(self, del_hf_env_vars): - """Test serialization to dict with Secret token""" - component = HuggingFaceTEIRanker( - 
url="https://api.my-tei-service.com", top_k=5, timeout=30, max_retries=4, retry_status_codes=[500, 502] - ) - data = component.to_dict() - - assert data["type"] == "haystack.components.rankers.hugging_face_tei.HuggingFaceTEIRanker" - assert data["init_parameters"]["url"] == "https://api.my-tei-service.com" - assert data["init_parameters"]["top_k"] == 5 - assert data["init_parameters"]["timeout"] == 30 - assert data["init_parameters"]["token"] == { - "env_vars": ["HF_API_TOKEN", "HF_TOKEN"], - "strict": False, - "type": "env_var", - } - assert data["init_parameters"]["max_retries"] == 4 - assert data["init_parameters"]["retry_status_codes"] == [500, 502] - - def test_from_dict(self, del_hf_env_vars): - """Test deserialization from dict with environment variable token""" - data = { - "type": "haystack.components.rankers.hugging_face_tei.HuggingFaceTEIRanker", - "init_parameters": { - "url": "https://api.my-tei-service.com", - "top_k": 5, - "timeout": 30, - "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False}, - "max_retries": 4, - "retry_status_codes": [500, 502], - }, - } - - component = HuggingFaceTEIRanker.from_dict(data) - - assert component.url == "https://api.my-tei-service.com" - assert component.top_k == 5 - assert component.timeout == 30 - assert component.max_retries == 4 - assert component.retry_status_codes == [500, 502] - - def test_empty_documents(self, del_hf_env_vars): - """Test that empty documents list returns empty result""" - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com") - result = ranker.run(query="test query", documents=[]) - assert result == {"documents": []} - - @patch("haystack.components.rankers.hugging_face_tei.request_with_retry") - def test_run_with_mock(self, mock_request, del_hf_env_vars): - """Test run method with mocked API response""" - # Setup mock response - mock_response = MagicMock(spec=httpx.Response) - mock_response.json.return_value = [ - {"index": 2, "score": 0.95}, - 
{"index": 1, "score": 0.85}, - {"index": 0, "score": 0.75}, - ] - mock_request.return_value = mock_response - - # Create ranker and test documents - token = Secret.from_token("test_token") - ranker = HuggingFaceTEIRanker( - url="https://api.my-tei-service.com", - top_k=3, - timeout=30, - token=token, - max_retries=4, - retry_status_codes=[500, 502], - ) - - docs = [Document(content="Document A"), Document(content="Document B"), Document(content="Document C")] - - # Run the ranker - result = ranker.run(query="test query", documents=docs) - - # Check that request_with_retry was called with correct parameters - mock_request.assert_called_once_with( - method="POST", - url="https://api.my-tei-service.com/rerank", - json={"query": "test query", "texts": ["Document A", "Document B", "Document C"], "raw_scores": False}, - timeout=30, - headers={"Authorization": "Bearer test_token"}, - attempts=4, - status_codes_to_retry=[500, 502], - ) - - # Check that documents are ranked correctly - assert len(result["documents"]) == 3 - assert result["documents"][0].content == "Document C" - assert result["documents"][0].score == 0.95 - assert result["documents"][1].content == "Document B" - assert result["documents"][1].score == 0.85 - assert result["documents"][2].content == "Document A" - assert result["documents"][2].score == 0.75 - - @patch("haystack.components.rankers.hugging_face_tei.request_with_retry") - def test_run_with_truncation_direction(self, mock_request, del_hf_env_vars): - """Test run method with truncation direction parameter""" - # Setup mock response - mock_response = MagicMock(spec=httpx.Response) - mock_response.json.return_value = [{"index": 0, "score": 0.95}] - mock_request.return_value = mock_response - - # Create ranker and test documents - token = Secret.from_token("test_token") - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com", token=token) - docs = [Document(content="Document A")] - - # Run the ranker with truncation direction - 
ranker.run(query="test query", documents=docs, truncation_direction=TruncationDirection.LEFT) - - # Check that request includes truncation parameters - mock_request.assert_called_once_with( - method="POST", - url="https://api.my-tei-service.com/rerank", - json={ - "query": "test query", - "texts": ["Document A"], - "raw_scores": False, - "truncate": True, - "truncation_direction": "Left", - }, - timeout=30, - headers={"Authorization": "Bearer test_token"}, - attempts=3, - status_codes_to_retry=None, - ) - - @patch("haystack.components.rankers.hugging_face_tei.request_with_retry") - def test_run_with_custom_top_k(self, mock_request, del_hf_env_vars): - """Test run method with custom top_k parameter""" - # Setup mock response with 5 documents - mock_response = MagicMock(spec=httpx.Response) - mock_response.json.return_value = [ - {"index": 4, "score": 0.95}, - {"index": 3, "score": 0.90}, - {"index": 2, "score": 0.85}, - {"index": 1, "score": 0.80}, - {"index": 0, "score": 0.75}, - ] - mock_request.return_value = mock_response - - # Create ranker with top_k=3 - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com", top_k=3) - - # Create 5 test documents - docs = [Document(content=f"Document {i}") for i in range(5)] - - # Run the ranker - result = ranker.run(query="test query", documents=docs) - - # Check that only top 3 documents are returned - assert len(result["documents"]) == 3 - assert result["documents"][0].content == "Document 4" - assert result["documents"][1].content == "Document 3" - assert result["documents"][2].content == "Document 2" - - # Test with run-time top_k override - result = ranker.run(query="test query", documents=docs, top_k=2) - - # Check that only top 2 documents are returned - assert len(result["documents"]) == 2 - assert result["documents"][0].content == "Document 4" - assert result["documents"][1].content == "Document 3" - - @patch("haystack.components.rankers.hugging_face_tei.request_with_retry") - def 
test_run_deduplicates_documents(self, mock_request, del_hf_env_vars): - """Test that duplicate documents are removed before sending to the API.""" - mock_response = MagicMock(spec=httpx.Response) - mock_response.json.return_value = [{"index": 1, "score": 0.9}, {"index": 0, "score": 0.2}] - mock_request.return_value = mock_response - - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com") - # Document with duplicate id and lower score should be dropped - docs = [ - Document(id="duplicate", content="keep me", score=0.9), - Document(id="duplicate", content="drop me", score=0.1), - Document(id="unique", content="unique"), - ] - - result = ranker.run(query="test query", documents=docs) - - mock_request.assert_called_once_with( - method="POST", - url="https://api.my-tei-service.com/rerank", - json={"query": "test query", "texts": ["keep me", "unique"], "raw_scores": False}, - timeout=30, - headers={}, - attempts=3, - status_codes_to_retry=None, - ) - assert len(result["documents"]) == 2 - assert result["documents"][0].content == "unique" - assert result["documents"][1].content == "keep me" - - @patch("haystack.components.rankers.hugging_face_tei.request_with_retry") - def test_error_handling(self, mock_request, del_hf_env_vars): - """Test error handling in the ranker""" - # Setup mock response with error - mock_response = MagicMock(spec=httpx.Response) - mock_response.json.return_value = {"error": "Some error occurred", "error_type": "TestError"} - mock_request.return_value = mock_response - - # Create ranker and test documents - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com") - docs = [Document(content="Document A")] - - # Test that RuntimeError is raised with the correct message - with pytest.raises( - RuntimeError, match=r"HuggingFaceTEIRanker API call failed \(TestError\): Some error occurred" - ): - ranker.run(query="test query", documents=docs) - - # Test unexpected response format - mock_response.json.return_value = {"unexpected": 
"format"} - with pytest.raises(TypeError, match="Unexpected response format from text-embeddings-inference rerank API"): - ranker.run(query="test query", documents=docs) - - @pytest.mark.asyncio - @patch("haystack.components.rankers.hugging_face_tei.async_request_with_retry") - async def test_run_async_with_mock(self, mock_request, del_hf_env_vars): - """Test run_async method with mocked API response""" - # Setup mock response - mock_response = MagicMock(spec=httpx.Response) - mock_response.json.return_value = [ - {"index": 2, "score": 0.95}, - {"index": 1, "score": 0.85}, - {"index": 0, "score": 0.75}, - ] - mock_request.return_value = mock_response - - # Create ranker and test documents - token = Secret.from_token("test_token") - ranker = HuggingFaceTEIRanker( - url="https://api.my-tei-service.com", - top_k=3, - timeout=30, - token=token, - max_retries=4, - retry_status_codes=[500, 502], - ) - - docs = [Document(content="Document A"), Document(content="Document B"), Document(content="Document C")] - - # Run the ranker asynchronously - result = await ranker.run_async(query="test query", documents=docs) - - # Check that async_request_with_retry was called with correct parameters - mock_request.assert_called_once_with( - method="POST", - url="https://api.my-tei-service.com/rerank", - json={"query": "test query", "texts": ["Document A", "Document B", "Document C"], "raw_scores": False}, - timeout=30, - headers={"Authorization": "Bearer test_token"}, - attempts=4, - status_codes_to_retry=[500, 502], - ) - - # Check that documents are ranked correctly - assert len(result["documents"]) == 3 - assert result["documents"][0].content == "Document C" - assert result["documents"][0].score == 0.95 - assert result["documents"][1].content == "Document B" - assert result["documents"][1].score == 0.85 - assert result["documents"][2].content == "Document A" - assert result["documents"][2].score == 0.75 - - @pytest.mark.asyncio - 
@patch("haystack.components.rankers.hugging_face_tei.async_request_with_retry") - async def test_run_async_deduplicates_documents(self, mock_request, del_hf_env_vars): - """Test that duplicate documents are removed before sending to the API.""" - mock_response = MagicMock(spec=httpx.Response) - mock_response.json.return_value = [{"index": 1, "score": 0.9}, {"index": 0, "score": 0.2}] - mock_request.return_value = mock_response - - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com") - # Document with duplicate id and lower score should be dropped - docs = [ - Document(id="duplicate", content="keep me", score=0.9), - Document(id="duplicate", content="drop me", score=0.1), - Document(id="unique", content="unique"), - ] - - result = await ranker.run_async(query="test query", documents=docs) - - mock_request.assert_called_once_with( - method="POST", - url="https://api.my-tei-service.com/rerank", - json={"query": "test query", "texts": ["keep me", "unique"], "raw_scores": False}, - timeout=30, - headers={}, - attempts=3, - status_codes_to_retry=None, - ) - assert len(result["documents"]) == 2 - assert result["documents"][0].content == "unique" - assert result["documents"][1].content == "keep me" - - @pytest.mark.asyncio - @patch("haystack.components.rankers.hugging_face_tei.async_request_with_retry") - async def test_run_async_empty_documents(self, mock_request, del_hf_env_vars): - """Test run_async with empty documents list""" - ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com") - result = await ranker.run_async(query="test query", documents=[]) - - # Check that no API call was made - mock_request.assert_not_called() - assert result == {"documents": []}