diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml
index 690e41120c..61357ebd0a 100644
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@@ -60,8 +60,6 @@ jobs:
               - "haystack/components/audio/whisper_local.py"
               - "haystack/components/classifiers/zero_shot_document_classifier.py"
               - "haystack/components/converters/tika.py"
-              - "haystack/components/embedders/hugging_face_api_document_embedder.py"
-              - "haystack/components/embedders/hugging_face_api_text_embedder.py"
               - "haystack/components/embedders/backends/sentence_transformers_backend.py"
               - "haystack/components/embedders/backends/sentence_transformers_sparse_backend.py"
               - "haystack/components/embedders/image/sentence_transformers_doc_image_embedder.py"
@@ -69,10 +67,7 @@ jobs:
               - "haystack/components/embedders/sentence_transformers_sparse_document_embedder.py"
               - "haystack/components/embedders/sentence_transformers_sparse_text_embedder.py"
               - "haystack/components/evaluators/sas_evaluator.py"
-              - "haystack/components/generators/chat/hugging_face_api.py"
               - "haystack/components/generators/chat/hugging_face_local.py"
-              - "haystack/components/generators/hugging_face_api.py"
-              - "haystack/components/generators/hugging_face_local_generator.py"
               - "haystack/components/generators/openai_dalle.py"
               - "haystack/components/preprocessors/embedding_based_document_splitter.py"
               - "haystack/components/rankers/sentence_transformers_diversity.py"
@@ -86,17 +81,12 @@ jobs:
               - "test/components/audio/test_whisper_local.py"
               - "test/components/classifiers/test_zero_shot_document_classifier.py"
               - "test/components/converters/test_tika_doc_converter.py"
-              - "test/components/embedders/test_hugging_face_api_document_embedder.py"
-              - "test/components/embedders/test_hugging_face_api_text_embedder.py"
               - "test/components/embedders/image/test_sentence_transformers_doc_image_embedder.py"
               - "test/components/embedders/test_sentence_transformers_text_embedder.py"
               - "test/components/embedders/test_sentence_transformers_sparse_document_embedder.py"
               - "test/components/embedders/test_sentence_transformers_sparse_text_embedder.py"
               - "test/components/evaluators/test_sas_evaluator.py"
-              - "test/components/generators/chat/test_hugging_face_api.py"
               - "test/components/generators/chat/test_hugging_face_local.py"
-              - "test/components/generators/test_hugging_face_api.py"
-              - "test/components/generators/test_hugging_face_local_generator.py"
               - "test/components/generators/test_openai_dalle.py"
               - "test/components/preprocessors/test_embedding_based_document_splitter.py"
               - "test/components/rankers/test_sentence_transformers_diversity.py"
diff --git a/MIGRATION.md b/MIGRATION.md
index 697fda4614..21722c72e6 100644
--- a/MIGRATION.md
+++ b/MIGRATION.md
@@ -40,6 +40,8 @@ component = NewComponent(new_param="value")
 - **One entry per breaking change.** Don't bundle unrelated changes into a single entry.
 - **Include a working code example** for every rename, removal, or signature change.
 - **Link to the PR** when extra context would help (e.g. `See [#1234](https://github.com/deepset-ai/haystack/pull/1234)`).
+- **Components moved to external packages** don't need a full entry: add a row to the table in
+  [Components Moved to External Packages](#components-moved-to-external-packages) instead.
 
 ---
 
@@ -71,6 +73,27 @@ from haystack.dataclasses import Document
 doc = Document(content="col\n1\n2\n3")
 ```
 
+### Components Moved to External Packages
+
+**What changed:** Some components have been moved out of Haystack into dedicated integration packages,
+hosted in the [haystack-core-integrations](https://github.com/deepset-ai/haystack-core-integrations) repository.
+
+**Why:** Moving these components to separate packages lets us test them more thoroughly in isolation and
+release fixes independently of the Haystack release cycle. This also makes Haystack development and CI leaner.
+
+**How to migrate:** Install the new package and update your imports as shown in the table below.
+
+```bash
+pip install <new-package>
+```
+
+| Old import (`haystack-ai<3.0.0`) | New package | New import |
+|---|---|---|
+| `from haystack.components.generators.chat import HuggingFaceAPIChatGenerator` | `huggingface-api-haystack` | `from haystack_integrations.components.generators.huggingface_api import HuggingFaceAPIChatGenerator` |
+| `from haystack.components.embedders import HuggingFaceAPITextEmbedder` | `huggingface-api-haystack` | `from haystack_integrations.components.embedders.huggingface_api import HuggingFaceAPITextEmbedder` |
+| `from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder` | `huggingface-api-haystack` | `from haystack_integrations.components.embedders.huggingface_api import HuggingFaceAPIDocumentEmbedder` |
+| `from haystack.components.rankers import HuggingFaceTEIRanker` | `huggingface-api-haystack` | `from haystack_integrations.components.rankers.huggingface_api import HuggingFaceTEIRanker` |
+
 ### ToolInvoker component removed
 
 **What changed:** The `ToolInvoker` component has been removed. Imports from `haystack.components.tools`
diff --git a/haystack/components/embedders/__init__.py b/haystack/components/embedders/__init__.py
index a6c92ca3b3..d15d5cf761 100644
--- a/haystack/components/embedders/__init__.py
+++ b/haystack/components/embedders/__init__.py
@@ -10,8 +10,6 @@
 _import_structure = {
     "azure_document_embedder": ["AzureOpenAIDocumentEmbedder"],
     "azure_text_embedder": ["AzureOpenAITextEmbedder"],
-    "hugging_face_api_document_embedder": ["HuggingFaceAPIDocumentEmbedder"],
-    "hugging_face_api_text_embedder": ["HuggingFaceAPITextEmbedder"],
     "openai_document_embedder": ["OpenAIDocumentEmbedder"],
     "openai_text_embedder": ["OpenAITextEmbedder"],
     "sentence_transformers_document_embedder": ["SentenceTransformersDocumentEmbedder"],
@@ -23,8 +21,6 @@
 if TYPE_CHECKING:
     from .azure_document_embedder import AzureOpenAIDocumentEmbedder as AzureOpenAIDocumentEmbedder
     from .azure_text_embedder import AzureOpenAITextEmbedder as AzureOpenAITextEmbedder
-    from .hugging_face_api_document_embedder import HuggingFaceAPIDocumentEmbedder as HuggingFaceAPIDocumentEmbedder
-    from .hugging_face_api_text_embedder import HuggingFaceAPITextEmbedder as HuggingFaceAPITextEmbedder
     from .openai_document_embedder import OpenAIDocumentEmbedder as OpenAIDocumentEmbedder
     from .openai_text_embedder import OpenAITextEmbedder as OpenAITextEmbedder
     from .sentence_transformers_document_embedder import (
diff --git a/haystack/components/embedders/hugging_face_api_document_embedder.py b/haystack/components/embedders/hugging_face_api_document_embedder.py
deleted file mode 100644
index 0b2951f8ed..0000000000
--- a/haystack/components/embedders/hugging_face_api_document_embedder.py
+++ /dev/null
@@ -1,378 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from asyncio import Semaphore, gather
-from dataclasses import replace
-from itertools import chain
-from typing import Any
-
-from tqdm import tqdm
-
-from haystack import component, default_from_dict, default_to_dict, logging
-from haystack.dataclasses import Document
-from haystack.lazy_imports import LazyImport
-from haystack.utils import Secret
-from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
-from haystack.utils.url_validation import is_valid_http_url
-
-with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
-    from huggingface_hub import AsyncInferenceClient, InferenceClient
-
-logger = logging.getLogger(__name__)
-
-
-@component
-class HuggingFaceAPIDocumentEmbedder:
-    """
-    Embeds documents using Hugging Face APIs.
-
-    Use it with the following Hugging Face APIs:
-    - [Free Serverless Inference API](https://huggingface.co/inference-api)
-    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
-    - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)
-
-
-    ### Usage examples
-
-    #### With free serverless inference API
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
-    from haystack.utils import Secret
-    from haystack.dataclasses import Document
-
-    doc = Document(content="I love pizza!")
-
-    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="serverless_inference_api",
-                                                  api_params={"model": "BAAI/bge-small-en-v1.5"},
-                                                  token=Secret.from_token("<your-api-key>"))
-
-    result = document_embedder.run([doc])
-    print(result["documents"][0].embedding)
-
-    # [0.017020374536514282, -0.023255806416273117, ...]
-    ```
-
-    #### With paid inference endpoints
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
-    from haystack.utils import Secret
-    from haystack.dataclasses import Document
-
-    doc = Document(content="I love pizza!")
-
-    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="inference_endpoints",
-                                                  api_params={"url": "<your-inference-endpoint-url>"},
-                                                  token=Secret.from_token("<your-api-key>"))
-
-    result = document_embedder.run([doc])
-    print(result["documents"][0].embedding)
-
-    # [0.017020374536514282, -0.023255806416273117, ...]
-    ```
-
-    #### With self-hosted text embeddings inference
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
-    from haystack.dataclasses import Document
-
-    doc = Document(content="I love pizza!")
-
-    doc_embedder = HuggingFaceAPIDocumentEmbedder(api_type="text_embeddings_inference",
-                                                  api_params={"url": "http://localhost:8080"})
-
-    result = document_embedder.run([doc])
-    print(result["documents"][0].embedding)
-
-    # [0.017020374536514282, -0.023255806416273117, ...]
-    ```
-    """
-
-    def __init__(
-        self,
-        api_type: HFEmbeddingAPIType | str,
-        api_params: dict[str, str],
-        token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
-        prefix: str = "",
-        suffix: str = "",
-        truncate: bool | None = True,
-        normalize: bool | None = False,
-        batch_size: int = 32,
-        progress_bar: bool = True,
-        meta_fields_to_embed: list[str] | None = None,
-        embedding_separator: str = "\n",
-        concurrency_limit: int = 4,
-    ) -> None:
-        """
-        Creates a HuggingFaceAPIDocumentEmbedder component.
-
-        :param api_type:
-            The type of Hugging Face API to use.
-        :param api_params:
-            A dictionary with the following keys:
-            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
-            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
-            `TEXT_EMBEDDINGS_INFERENCE`.
-        :param token: The Hugging Face token to use as HTTP bearer authorization.
-            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
-        :param prefix:
-            A string to add at the beginning of each text.
-        :param suffix:
-            A string to add at the end of each text.
-        :param truncate:
-            Truncates the input text to the maximum length supported by the model.
-            Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
-            if the backend uses Text Embeddings Inference.
-            If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
-        :param normalize:
-            Normalizes the embeddings to unit length.
-            Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
-            if the backend uses Text Embeddings Inference.
-            If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
-        :param batch_size:
-            Number of documents to process at once.
-        :param progress_bar:
-            If `True`, shows a progress bar when running.
-        :param meta_fields_to_embed:
-            List of metadata fields to embed along with the document text.
-        :param embedding_separator:
-            Separator used to concatenate the metadata fields to the document text.
-        :param concurrency_limit:
-            The maximum number of requests that should be allowed to run concurrently.
-            This parameter is only used in the `run_async` method.
-        """
-        huggingface_hub_import.check()
-
-        if isinstance(api_type, str):
-            api_type = HFEmbeddingAPIType.from_str(api_type)
-
-        api_params = api_params or {}
-
-        if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
-            model = api_params.get("model")
-            if model is None:
-                raise ValueError(
-                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
-                )
-            check_valid_model(model, HFModelType.EMBEDDING, token)
-            model_or_url = model
-        elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]:
-            url = api_params.get("url")
-            if url is None:
-                msg = (
-                    "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` "
-                    "parameter in `api_params`."
-                )
-                raise ValueError(msg)
-            if not is_valid_http_url(url):
-                raise ValueError(f"Invalid URL: {url}")
-            model_or_url = url
-        else:
-            msg = f"Unknown api_type {api_type}"
-            raise ValueError(msg)
-
-        client_args: dict[str, Any] = {"model": model_or_url, "token": token.resolve_value() if token else None}
-
-        self.api_type = api_type
-        self.api_params = api_params
-        self.token = token
-        self.prefix = prefix
-        self.suffix = suffix
-        self.truncate = truncate
-        self.normalize = normalize
-        self.batch_size = batch_size
-        self.progress_bar = progress_bar
-        self.meta_fields_to_embed = meta_fields_to_embed or []
-        self.embedding_separator = embedding_separator
-        self.concurrency_limit = concurrency_limit
-        self._client = InferenceClient(**client_args)
-        self._async_client = AsyncInferenceClient(**client_args)
-
-    def to_dict(self) -> dict[str, Any]:
-        """
-        Serializes the component to a dictionary.
-
-        :returns:
-            Dictionary with serialized data.
-        """
-        return default_to_dict(
-            self,
-            api_type=str(self.api_type),
-            api_params=self.api_params,
-            prefix=self.prefix,
-            suffix=self.suffix,
-            token=self.token,
-            truncate=self.truncate,
-            normalize=self.normalize,
-            batch_size=self.batch_size,
-            progress_bar=self.progress_bar,
-            meta_fields_to_embed=self.meta_fields_to_embed,
-            embedding_separator=self.embedding_separator,
-            concurrency_limit=self.concurrency_limit,
-        )
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPIDocumentEmbedder":
-        """
-        Deserializes the component from a dictionary.
-
-        :param data:
-            Dictionary to deserialize from.
-        :returns:
-            Deserialized component.
-        """
-        return default_from_dict(cls, data)
-
-    def _prepare_texts_to_embed(self, documents: list[Document]) -> list[str]:
-        """
-        Prepare the texts to embed by concatenating the Document text with the metadata fields to embed.
-        """
-        texts_to_embed = []
-        for doc in documents:
-            meta_values_to_embed = [
-                str(doc.meta[key]) for key in self.meta_fields_to_embed if key in doc.meta and doc.meta[key] is not None
-            ]
-
-            text_to_embed = (
-                self.prefix + self.embedding_separator.join(meta_values_to_embed + [doc.content or ""]) + self.suffix
-            )
-
-            texts_to_embed.append(text_to_embed)
-        return texts_to_embed
-
-    @staticmethod
-    def _adjust_api_parameters(
-        truncate: bool | None, normalize: bool | None, api_type: HFEmbeddingAPIType
-    ) -> tuple[bool | None, bool | None]:
-        """
-        Adjust the truncate and normalize parameters based on the API type.
-        """
-        if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
-            if truncate is not None:
-                msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored."
-                logger.warning(msg)
-                truncate = None
-            if normalize is not None:
-                msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored."
-                logger.warning(msg)
-                normalize = None
-        return truncate, normalize
-
-    def _embed_batch(self, texts_to_embed: list[str], batch_size: int) -> list[list[float]]:
-        """
-        Embed a list of texts in batches.
-        """
-        truncate, normalize = self._adjust_api_parameters(self.truncate, self.normalize, self.api_type)
-
-        all_embeddings: list = []
-        for i in tqdm(
-            range(0, len(texts_to_embed), batch_size), disable=not self.progress_bar, desc="Calculating embeddings"
-        ):
-            batch = texts_to_embed[i : i + batch_size]
-
-            np_embeddings = self._client.feature_extraction(text=batch, truncate=truncate, normalize=normalize)
-
-            if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch):
-                raise ValueError(f"Expected embedding shape ({batch_size}, embedding_dim), got {np_embeddings.shape}")
-
-            all_embeddings.extend(np_embeddings.tolist())
-
-        return all_embeddings
-
-    async def _embed_batch_async(self, texts_to_embed: list[str], batch_size: int) -> list[list[float]]:
-        """
-        Embed a list of texts in batches asynchronously.
-        """
-        truncate, normalize = self._adjust_api_parameters(self.truncate, self.normalize, self.api_type)
-        sem = Semaphore(max(1, self.concurrency_limit))
-        num_batches = (len(texts_to_embed) + batch_size - 1) // batch_size
-        pbar = tqdm(total=num_batches, disable=not self.progress_bar, desc="Calculating embeddings")
-
-        async def _runner(batch: list[str]) -> list[list[float]]:
-            async with sem:
-                np_embeddings = await self._async_client.feature_extraction(
-                    text=batch, truncate=truncate, normalize=normalize
-                )
-
-                if np_embeddings.ndim != 2 or np_embeddings.shape[0] != len(batch):
-                    raise ValueError(
-                        f"Expected embedding shape ({batch_size}, embedding_dim), got {np_embeddings.shape}"
-                    )
-
-                pbar.update(1)
-                return np_embeddings.tolist()
-
-        try:
-            all_embeddings = [
-                *chain(
-                    *await gather(
-                        *[
-                            _runner(texts_to_embed[i : i + batch_size])
-                            for i in range(0, len(texts_to_embed), batch_size)
-                        ]
-                    )
-                )
-            ]
-        finally:
-            pbar.close()
-
-        return all_embeddings
-
-    @component.output_types(documents=list[Document])
-    def run(self, documents: list[Document]) -> dict[str, list[Document]]:
-        """
-        Embeds a list of documents.
-
-        :param documents:
-            Documents to embed.
-
-        :returns:
-            A dictionary with the following keys:
-            - `documents`: A list of documents with embeddings.
-        """
-        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
-            raise TypeError(
-                "HuggingFaceAPIDocumentEmbedder expects a list of Documents as input."
-                " In case you want to embed a string, please use the HuggingFaceAPITextEmbedder."
-            )
-
-        texts_to_embed = self._prepare_texts_to_embed(documents=documents)
-
-        embeddings = self._embed_batch(texts_to_embed=texts_to_embed, batch_size=self.batch_size)
-
-        new_documents = []
-        for doc, emb in zip(documents, embeddings, strict=True):
-            new_documents.append(replace(doc, embedding=emb))
-
-        return {"documents": new_documents}
-
-    @component.output_types(documents=list[Document])
-    async def run_async(self, documents: list[Document]) -> dict[str, list[Document]]:
-        """
-        Embeds a list of documents asynchronously.
-
-        :param documents:
-            Documents to embed.
-
-        :returns:
-            A dictionary with the following keys:
-            - `documents`: A list of documents with embeddings.
-        """
-        if not isinstance(documents, list) or documents and not isinstance(documents[0], Document):
-            raise TypeError(
-                "HuggingFaceAPIDocumentEmbedder expects a list of Documents as input."
-                " In case you want to embed a string, please use the HuggingFaceAPITextEmbedder."
-            )
-
-        texts_to_embed = self._prepare_texts_to_embed(documents=documents)
-
-        embeddings = await self._embed_batch_async(texts_to_embed=texts_to_embed, batch_size=self.batch_size)
-
-        new_documents = []
-        for doc, emb in zip(documents, embeddings, strict=True):
-            new_documents.append(replace(doc, embedding=emb))
-
-        return {"documents": new_documents}
diff --git a/haystack/components/embedders/hugging_face_api_text_embedder.py b/haystack/components/embedders/hugging_face_api_text_embedder.py
deleted file mode 100644
index 5eb45a8c73..0000000000
--- a/haystack/components/embedders/hugging_face_api_text_embedder.py
+++ /dev/null
@@ -1,258 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Any
-
-from haystack import component, default_from_dict, default_to_dict, logging
-from haystack.lazy_imports import LazyImport
-from haystack.utils import Secret
-from haystack.utils.hf import HFEmbeddingAPIType, HFModelType, check_valid_model
-from haystack.utils.url_validation import is_valid_http_url
-
-with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
-    from huggingface_hub import AsyncInferenceClient, InferenceClient
-
-logger = logging.getLogger(__name__)
-
-
-@component
-class HuggingFaceAPITextEmbedder:
-    """
-    Embeds strings using Hugging Face APIs.
-
-    Use it with the following Hugging Face APIs:
-    - [Free Serverless Inference API](https://huggingface.co/inference-api)
-    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
-    - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)
-
-    ### Usage examples
-
-    #### With free serverless inference API
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.embedders import HuggingFaceAPITextEmbedder
-    from haystack.utils import Secret
-
-    text_embedder = HuggingFaceAPITextEmbedder(api_type="serverless_inference_api",
-                                               api_params={"model": "BAAI/bge-small-en-v1.5"},
-                                               token=Secret.from_token("<your-api-key>"))
-
-    print(text_embedder.run("I love pizza!"))
-
-    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
-    ```
-
-    #### With paid inference endpoints
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.embedders import HuggingFaceAPITextEmbedder
-    from haystack.utils import Secret
-    text_embedder = HuggingFaceAPITextEmbedder(api_type="inference_endpoints",
-                                               api_params={"model": "BAAI/bge-small-en-v1.5"},
-                                               token=Secret.from_token("<your-api-key>"))
-
-    print(text_embedder.run("I love pizza!"))
-
-    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
-    ```
-
-    #### With self-hosted text embeddings inference
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.embedders import HuggingFaceAPITextEmbedder
-    from haystack.utils import Secret
-
-    text_embedder = HuggingFaceAPITextEmbedder(api_type="text_embeddings_inference",
-                                               api_params={"url": "http://localhost:8080"})
-
-    print(text_embedder.run("I love pizza!"))
-
-    # {'embedding': [0.017020374536514282, -0.023255806416273117, ...],
-    ```
-    """
-
-    def __init__(
-        self,
-        api_type: HFEmbeddingAPIType | str,
-        api_params: dict[str, str],
-        token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
-        prefix: str = "",
-        suffix: str = "",
-        truncate: bool | None = True,
-        normalize: bool | None = False,
-    ) -> None:
-        """
-        Creates a HuggingFaceAPITextEmbedder component.
-
-        :param api_type:
-            The type of Hugging Face API to use.
-        :param api_params:
-            A dictionary with the following keys:
-            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
-            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
-            `TEXT_EMBEDDINGS_INFERENCE`.
-        :param token: The Hugging Face token to use as HTTP bearer authorization.
-            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
-        :param prefix:
-            A string to add at the beginning of each text.
-        :param suffix:
-            A string to add at the end of each text.
-        :param truncate:
-            Truncates the input text to the maximum length supported by the model.
-            Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
-            if the backend uses Text Embeddings Inference.
-            If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
-        :param normalize:
-            Normalizes the embeddings to unit length.
-            Applicable when `api_type` is `TEXT_EMBEDDINGS_INFERENCE`, or `INFERENCE_ENDPOINTS`
-            if the backend uses Text Embeddings Inference.
-            If `api_type` is `SERVERLESS_INFERENCE_API`, this parameter is ignored.
-        """
-        huggingface_hub_import.check()
-
-        if isinstance(api_type, str):
-            api_type = HFEmbeddingAPIType.from_str(api_type)
-
-        if api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
-            model = api_params.get("model")
-            if model is None:
-                raise ValueError(
-                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
-                )
-            check_valid_model(model, HFModelType.EMBEDDING, token)
-            model_or_url = model
-        elif api_type in [HFEmbeddingAPIType.INFERENCE_ENDPOINTS, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE]:
-            url = api_params.get("url")
-            if url is None:
-                msg = (
-                    "To use Text Embeddings Inference or Inference Endpoints, you need to specify the `url` "
-                    "parameter in `api_params`."
-                )
-                raise ValueError(msg)
-            if not is_valid_http_url(url):
-                raise ValueError(f"Invalid URL: {url}")
-            model_or_url = url
-        else:
-            msg = f"Unknown api_type {api_type}"
-            raise ValueError(msg)
-
-        self.api_type = api_type
-        self.api_params = api_params
-        self.token = token
-        self.prefix = prefix
-        self.suffix = suffix
-        self.truncate = truncate
-        self.normalize = normalize
-        self._client = InferenceClient(model_or_url, token=token.resolve_value() if token else None)
-        self._async_client = AsyncInferenceClient(model_or_url, token=token.resolve_value() if token else None)
-
-    def _prepare_input(self, text: str) -> tuple[str, bool | None, bool | None]:
-        if not isinstance(text, str):
-            raise TypeError(
-                "HuggingFaceAPITextEmbedder expects a string as an input."
-                "In case you want to embed a list of Documents, please use the HuggingFaceAPIDocumentEmbedder."
-            )
-
-        truncate = self.truncate
-        normalize = self.normalize
-
-        if self.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API:
-            if truncate is not None:
-                msg = "`truncate` parameter is not supported for Serverless Inference API. It will be ignored."
-                logger.warning(msg)
-                truncate = None
-            if normalize is not None:
-                msg = "`normalize` parameter is not supported for Serverless Inference API. It will be ignored."
-                logger.warning(msg)
-                normalize = None
-
-        text_to_embed = self.prefix + text + self.suffix
-
-        return text_to_embed, truncate, normalize
-
-    def to_dict(self) -> dict[str, Any]:
-        """
-        Serializes the component to a dictionary.
-
-        :returns:
-            Dictionary with serialized data.
-        """
-        return default_to_dict(
-            self,
-            api_type=str(self.api_type),
-            api_params=self.api_params,
-            prefix=self.prefix,
-            suffix=self.suffix,
-            token=self.token,
-            truncate=self.truncate,
-            normalize=self.normalize,
-        )
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPITextEmbedder":
-        """
-        Deserializes the component from a dictionary.
-
-        :param data:
-            Dictionary to deserialize from.
-        :returns:
-            Deserialized component.
-        """
-        return default_from_dict(cls, data)
-
-    @component.output_types(embedding=list[float])
-    def run(self, text: str) -> dict[str, Any]:
-        """
-        Embeds a single string.
-
-        :param text:
-            Text to embed.
-
-        :returns:
-            A dictionary with the following keys:
-            - `embedding`: The embedding of the input text.
-        """
-        text_to_embed, truncate_val, normalize_val = self._prepare_input(text)
-
-        np_embedding = self._client.feature_extraction(
-            text=text_to_embed, truncate=truncate_val, normalize=normalize_val
-        )
-
-        error_msg = f"Expected embedding shape (1, embedding_dim) or (embedding_dim,), got {np_embedding.shape}"
-        if np_embedding.ndim > 2:
-            raise ValueError(error_msg)
-        if np_embedding.ndim == 2 and np_embedding.shape[0] != 1:
-            raise ValueError(error_msg)
-
-        embedding = np_embedding.flatten().tolist()
-
-        return {"embedding": embedding}
-
-    @component.output_types(embedding=list[float])
-    async def run_async(self, text: str) -> dict[str, Any]:
-        """
-        Embeds a single string asynchronously.
-
-        :param text:
-            Text to embed.
-
-        :returns:
-            A dictionary with the following keys:
-            - `embedding`: The embedding of the input text.
-        """
-        text_to_embed, truncate_val, normalize_val = self._prepare_input(text)
-
-        np_embedding = await self._async_client.feature_extraction(
-            text=text_to_embed, truncate=truncate_val, normalize=normalize_val
-        )
-
-        error_msg = f"Expected embedding shape (1, embedding_dim) or (embedding_dim,), got {np_embedding.shape}"
-        if np_embedding.ndim > 2:
-            raise ValueError(error_msg)
-        if np_embedding.ndim == 2 and np_embedding.shape[0] != 1:
-            raise ValueError(error_msg)
-
-        embedding = np_embedding.flatten().tolist()
-
-        return {"embedding": embedding}
diff --git a/haystack/components/generators/chat/__init__.py b/haystack/components/generators/chat/__init__.py
index 66a2bfa229..fbfe3af243 100644
--- a/haystack/components/generators/chat/__init__.py
+++ b/haystack/components/generators/chat/__init__.py
@@ -13,7 +13,6 @@
     "azure": ["AzureOpenAIChatGenerator"],
     "azure_responses": ["AzureOpenAIResponsesChatGenerator"],
     "hugging_face_local": ["HuggingFaceLocalChatGenerator"],
-    "hugging_face_api": ["HuggingFaceAPIChatGenerator"],
     "fallback": ["FallbackChatGenerator"],
     "llm": ["LLM"],
 }
diff --git a/haystack/components/generators/chat/hugging_face_api.py b/haystack/components/generators/chat/hugging_face_api.py
deleted file mode 100644
index 83ed222c4d..0000000000
--- a/haystack/components/generators/chat/hugging_face_api.py
+++ /dev/null
@@ -1,732 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import json
-from collections.abc import AsyncIterable, Iterable
-from datetime import datetime
-from typing import Any, Union
-
-from haystack import component, default_from_dict, default_to_dict, logging
-from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message, _normalize_messages
-from haystack.dataclasses import (
-    ChatMessage,
-    ComponentInfo,
-    ReasoningContent,
-    StreamingCallbackT,
-    StreamingChunk,
-    SyncStreamingCallbackT,
-    ToolCall,
-    select_streaming_callback,
-)
-from haystack.dataclasses.streaming_chunk import FinishReason, _invoke_streaming_callback
-from haystack.lazy_imports import LazyImport
-from haystack.tools import (
-    ToolsType,
-    _check_duplicate_tool_names,
-    deserialize_tools_or_toolset_inplace,
-    flatten_tools_or_toolsets,
-    serialize_tools_or_toolset,
-    warm_up_tools,
-)
-from haystack.utils import Secret, deserialize_callable, serialize_callable
-from haystack.utils.hf import HFGenerationAPIType, HFModelType, check_valid_model, convert_message_to_hf_format
-from haystack.utils.url_validation import is_valid_http_url
-
-logger = logging.getLogger(__name__)
-
-with LazyImport(message="Run 'pip install \"huggingface_hub[inference]>=0.27.0\"'") as huggingface_hub_import:
-    from huggingface_hub import (
-        AsyncInferenceClient,
-        ChatCompletionInputFunctionDefinition,
-        ChatCompletionInputStreamOptions,
-        ChatCompletionInputTool,
-        ChatCompletionOutput,
-        ChatCompletionOutputComplete,
-        ChatCompletionOutputToolCall,
-        ChatCompletionStreamOutput,
-        ChatCompletionStreamOutputChoice,
-        InferenceClient,
-    )
-
-
-def _convert_hfapi_tool_calls(hfapi_tool_calls: list["ChatCompletionOutputToolCall"] | None) -> list[ToolCall]:
-    """
-    Convert HuggingFace API tool calls to a list of Haystack ToolCall.
-
-    :param hfapi_tool_calls: The HuggingFace API tool calls to convert.
-    :returns: A list of ToolCall objects.
-
-    """
-    if not hfapi_tool_calls:
-        return []
-
-    tool_calls = []
-
-    for hfapi_tc in hfapi_tool_calls:
-        hf_arguments = hfapi_tc.function.arguments
-
-        arguments = None
-        if isinstance(hf_arguments, dict):
-            arguments = hf_arguments
-        elif isinstance(hf_arguments, str):
-            try:
-                arguments = json.loads(hf_arguments)
-            except json.JSONDecodeError:
-                logger.warning(
-                    "HuggingFace API returned a malformed JSON string for tool call arguments. This tool call "
-                    "will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
-                    _id=hfapi_tc.id,
-                    _name=hfapi_tc.function.name,
-                    _arguments=hf_arguments,
-                )
-        else:
-            logger.warning(
-                "HuggingFace API returned tool call arguments of type {_type}. Valid types are dict and str. This tool "
-                "call will be skipped. Tool call ID: {_id}, Tool name: {_name}, Arguments: {_arguments}",
-                _id=hfapi_tc.id,
-                _name=hfapi_tc.function.name,
-                _arguments=hf_arguments,
-            )
-
-        if arguments:
-            tool_calls.append(ToolCall(tool_name=hfapi_tc.function.name, arguments=arguments, id=hfapi_tc.id))
-
-    return tool_calls
-
-
-def _extract_reasoning_content(message_or_delta: Any) -> ReasoningContent | None:
-    """
-    Extract reasoning content from a HuggingFace API message or delta object.
-
-    :param message_or_delta: The HuggingFace message or delta object that may contain reasoning.
-    :returns: ReasoningContent if reasoning is present, None otherwise.
-    """
-    if hasattr(message_or_delta, "reasoning") and message_or_delta.reasoning:
-        return ReasoningContent(reasoning_text=message_or_delta.reasoning)
-    return None
-
-
-def _resolve_schema_refs(schema: dict[str, Any]) -> dict[str, Any]:
-    """
-    Resolve ``$ref`` references in a JSON schema by inlining ``$defs`` definitions.
-
-    The HuggingFace API does not support ``$defs`` and ``$ref`` in tool parameter schemas.
-    This function expands all ``$ref`` pointers and removes the ``$defs`` section.
-
-    :param schema: A JSON schema dict potentially containing ``$defs`` and ``$ref``.
-    :returns: A new schema dict with all references resolved and ``$defs`` removed.
-    """
-    defs = schema.get("$defs", {})
-    if not defs:
-        return schema
-
-    def _resolve(obj: Any, resolving: set[str] | None = None) -> Any:
-        if resolving is None:
-            resolving = set()
-        if isinstance(obj, dict):
-            if "$ref" in obj:
-                ref_path = obj["$ref"]
-                parts = ref_path.split("/")
-                if len(parts) == 3 and parts[0] == "#" and parts[1] == "$defs":
-                    def_name = parts[2]
-                    if def_name in defs and def_name not in resolving:
-                        return _resolve(defs[def_name], resolving | {def_name})
-            return {k: _resolve(v, resolving) for k, v in obj.items() if k != "$defs"}
-        if isinstance(obj, list):
-            return [_resolve(item, resolving) for item in obj]
-        return obj
-
-    return _resolve(schema)
-
-
-def _convert_tools_to_hfapi_tools(tools: ToolsType | None) -> list["ChatCompletionInputTool"] | None:
-    if not tools:
-        return None
-
-    # huggingface_hub<0.31.0 uses "arguments", huggingface_hub>=0.31.0 uses "parameters"
-    parameters_name = "arguments" if hasattr(ChatCompletionInputFunctionDefinition, "arguments") else "parameters"
-
-    hf_tools = []
-    for tool in flatten_tools_or_toolsets(tools):
-        hf_tools_args = {
-            "name": tool.name,
-            "description": tool.description,
-            parameters_name: _resolve_schema_refs(tool.parameters),
-        }
-
-        hf_tools.append(
-            ChatCompletionInputTool(
-                function=ChatCompletionInputFunctionDefinition(**hf_tools_args),  # type: ignore[arg-type]
-                type="function",
-            )
-        )
-
-    return hf_tools
-
-
-def _map_hf_finish_reason_to_haystack(
-    choice: Union["ChatCompletionStreamOutputChoice", "ChatCompletionOutputComplete"],
-) -> FinishReason | None:
-    """
-    Map HuggingFace finish reasons to Haystack FinishReason literals.
-
-    Uses the full choice object to detect tool calls and provide accurate mapping.
-
-    HuggingFace finish reasons (can be found here https://huggingface.github.io/text-generation-inference/ under
-    FinishReason):
-    - "length": number of generated tokens == `max_new_tokens`
-    - "eos_token": the model generated its end of sequence token
-    - "stop_sequence": the model generated a text included in `stop_sequences`
-
-    Additionally, detects tool calls from delta.tool_calls or delta.tool_call_id.
-
-    :param choice: The HuggingFace ChatCompletionStreamOutputChoice object.
-    :returns: The corresponding Haystack FinishReason or None.
-    """
-    if choice.finish_reason is None:
-        return None
-
-    # Check if this choice contains tool call information
-    if isinstance(choice, ChatCompletionStreamOutputChoice):
-        has_tool_calls = choice.delta.tool_calls is not None or choice.delta.tool_call_id is not None
-    else:
-        has_tool_calls = choice.message.tool_calls is not None or choice.message.tool_call_id is not None
-
-    # If we detect tool calls, override the finish reason
-    if has_tool_calls:
-        return "tool_calls"
-
-    # Map HuggingFace finish reasons to Haystack standard ones
-    mapping: dict[str, FinishReason] = {
-        "length": "length",  # Direct match
-        "eos_token": "stop",  # EOS token means natural stop
-        "stop_sequence": "stop",  # Stop sequence means natural stop
-    }
-
-    return mapping.get(choice.finish_reason, "stop")  # Default to "stop" for unknown reasons
-
-
-def _convert_chat_completion_stream_output_to_streaming_chunk(
-    chunk: "ChatCompletionStreamOutput",
-    previous_chunks: list[StreamingChunk],
-    component_info: ComponentInfo | None = None,
-) -> StreamingChunk:
-    """
-    Converts the Hugging Face API ChatCompletionStreamOutput to a StreamingChunk.
-    """
-    # Choices is empty if include_usage is set to True where the usage information is returned.
-    if len(chunk.choices) == 0:
-        usage = None
-        if chunk.usage:
-            usage = {"prompt_tokens": chunk.usage.prompt_tokens, "completion_tokens": chunk.usage.completion_tokens}
-        return StreamingChunk(
-            content="",
-            meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "usage": usage},
-            component_info=component_info,
-        )
-
-    # n is unused, so the API always returns only one choice
-    # the argument is probably allowed for compatibility with OpenAI
-    # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
-    choice = chunk.choices[0]
-    mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
-
-    # Extract reasoning content if present
-    reasoning = _extract_reasoning_content(choice.delta)
-
-    return StreamingChunk(
-        content=choice.delta.content or "",
-        meta={"model": chunk.model, "received_at": datetime.now().isoformat(), "finish_reason": choice.finish_reason},
-        component_info=component_info,
-        # Index must always be 0 since we don't allow tool calls in streaming mode.
-        index=0 if choice.finish_reason is None else None,
-        # start is True at the very beginning since first chunk contains role information + first part of the answer.
-        start=len(previous_chunks) == 0,
-        finish_reason=mapped_finish_reason,
-        reasoning=reasoning,
-    )
-
-
-@component
-class HuggingFaceAPIChatGenerator:
-    """
-    Completes chats using Hugging Face APIs.
-
-    HuggingFaceAPIChatGenerator uses the [ChatMessage](https://docs.haystack.deepset.ai/docs/chatmessage)
-    format for input and output. Use it to generate text with Hugging Face APIs:
-    - [Serverless Inference API (Inference Providers)](https://huggingface.co/docs/inference-providers)
-    - [Paid Inference Endpoints](https://huggingface.co/inference-endpoints)
-    - [Self-hosted Text Generation Inference](https://github.com/huggingface/text-generation-inference)
-
-    ### Usage examples
-
-    #### With the serverless inference API (Inference Providers) - free tier available
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
-    from haystack.dataclasses import ChatMessage
-    from haystack.utils import Secret
-    from haystack.utils.hf import HFGenerationAPIType
-
-    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
-                ChatMessage.from_user("What's Natural Language Processing?")]
-
-    # the api_type can be expressed using the HFGenerationAPIType enum or as a string
-    api_type = HFGenerationAPIType.SERVERLESS_INFERENCE_API
-    api_type = "serverless_inference_api" # this is equivalent to the above
-
-    generator = HuggingFaceAPIChatGenerator(api_type=api_type,
-                                            api_params={"model": "Qwen/Qwen2.5-7B-Instruct",
-                                                        "provider": "together"},
-                                            token=Secret.from_token("<your-api-key>"))
-
-    result = generator.run(messages)
-    print(result)
-    ```
-
-    #### With the serverless inference API (Inference Providers) and text+image input
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
-    from haystack.dataclasses import ChatMessage, ImageContent
-    from haystack.utils import Secret
-    from haystack.utils.hf import HFGenerationAPIType
-
-    # Create an image from file path, URL, or base64
-    image = ImageContent.from_file_path("path/to/your/image.jpg")
-
-    # Create a multimodal message with both text and image
-    messages = [ChatMessage.from_user(content_parts=["Describe this image in detail", image])]
-
-    generator = HuggingFaceAPIChatGenerator(
-        api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-        api_params={
-            "model": "Qwen/Qwen2.5-VL-7B-Instruct",  # Vision Language Model
-            "provider": "hyperbolic"
-        },
-        token=Secret.from_token("<your-api-key>")
-    )
-
-    result = generator.run(messages)
-    print(result)
-    ```
-
-    #### With paid inference endpoints
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
-    from haystack.dataclasses import ChatMessage
-    from haystack.utils import Secret
-
-    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
-                ChatMessage.from_user("What's Natural Language Processing?")]
-
-    generator = HuggingFaceAPIChatGenerator(api_type="inference_endpoints",
-                                            api_params={"url": "<your-inference-endpoint-url>"},
-                                            token=Secret.from_token("<your-api-key>"))
-
-    result = generator.run(messages)
-    print(result)
-    ```
-
-    #### With self-hosted text generation inference
-    <!-- test-ignore -->
-    ```python
-    from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
-    from haystack.dataclasses import ChatMessage
-
-    messages = [ChatMessage.from_system("\\nYou are a helpful, respectful and honest assistant"),
-                ChatMessage.from_user("What's Natural Language Processing?")]
-
-    generator = HuggingFaceAPIChatGenerator(api_type="text_generation_inference",
-                                            api_params={"url": "http://localhost:8080"})
-
-    result = generator.run(messages)
-    print(result)
-    ```
-    """
-
-    def __init__(
-        self,
-        api_type: HFGenerationAPIType | str,
-        api_params: dict[str, str],
-        token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
-        generation_kwargs: dict[str, Any] | None = None,
-        stop_words: list[str] | None = None,
-        streaming_callback: StreamingCallbackT | None = None,
-        tools: ToolsType | None = None,
-    ) -> None:
-        """
-        Initialize the HuggingFaceAPIChatGenerator instance.
-
-        :param api_type:
-            The type of Hugging Face API to use. Available types:
-            - `text_generation_inference`: See [TGI](https://github.com/huggingface/text-generation-inference).
-            - `inference_endpoints`: See [Inference Endpoints](https://huggingface.co/inference-endpoints).
-            - `serverless_inference_api`: See
-            [Serverless Inference API - Inference Providers](https://huggingface.co/docs/inference-providers).
-        :param api_params:
-            A dictionary with the following keys:
-            - `model`: Hugging Face model ID. Required when `api_type` is `SERVERLESS_INFERENCE_API`.
-            - `provider`: Provider name. Recommended when `api_type` is `SERVERLESS_INFERENCE_API`.
-            - `url`: URL of the inference endpoint. Required when `api_type` is `INFERENCE_ENDPOINTS` or
-            `TEXT_GENERATION_INFERENCE`.
-            - Other parameters specific to the chosen API type, such as `timeout`, `headers`, etc.
-        :param token:
-            The Hugging Face token to use as HTTP bearer authorization.
-            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
-        :param generation_kwargs:
-            A dictionary with keyword arguments to customize text generation.
-                Some examples: `max_tokens`, `temperature`, `top_p`.
-                For details, see [Hugging Face chat_completion documentation](https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion).
-        :param stop_words:
-            An optional list of strings representing the stop words.
-        :param streaming_callback:
-            An optional callable for handling streaming responses.
-        :param tools:
-            A list of Tool and/or Toolset objects, or a single Toolset for which the model can prepare calls.
-            The chosen model should support tool/function calling, according to the model card.
-            Support for tools in the Hugging Face API and TGI is not yet fully refined and you may experience
-            unexpected behavior.
-        """
-
-        huggingface_hub_import.check()
-
-        if isinstance(api_type, str):
-            api_type = HFGenerationAPIType.from_str(api_type)
-
-        if api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API:
-            model = api_params.get("model")
-            if model is None:
-                raise ValueError(
-                    "To use the Serverless Inference API, you need to specify the `model` parameter in `api_params`."
-                )
-            check_valid_model(model, HFModelType.GENERATION, token)
-            model_or_url = model
-        elif api_type in [HFGenerationAPIType.INFERENCE_ENDPOINTS, HFGenerationAPIType.TEXT_GENERATION_INFERENCE]:
-            url = api_params.get("url")
-            if url is None:
-                msg = (
-                    "To use Text Generation Inference or Inference Endpoints, you need to specify the `url` parameter "
-                    "in `api_params`."
-                )
-                raise ValueError(msg)
-            if not is_valid_http_url(url):
-                raise ValueError(f"Invalid URL: {url}")
-            model_or_url = url
-        else:
-            msg = f"Unknown api_type {api_type}"
-            raise ValueError(msg)
-
-        if tools and streaming_callback is not None:
-            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
-        _check_duplicate_tool_names(flatten_tools_or_toolsets(tools))
-
-        # handle generation kwargs setup
-        generation_kwargs = generation_kwargs.copy() if generation_kwargs else {}
-        generation_kwargs["stop"] = generation_kwargs.get("stop", [])
-        generation_kwargs["stop"].extend(stop_words or [])
-        generation_kwargs.setdefault("max_tokens", 512)
-
-        self.api_type = api_type
-        self.api_params = api_params
-        self.token = token
-        self.generation_kwargs = generation_kwargs
-        self.streaming_callback = streaming_callback
-
-        resolved_api_params: dict[str, Any] = {k: v for k, v in api_params.items() if k != "model" and k != "url"}
-        self._client = InferenceClient(
-            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
-        )
-        self._async_client = AsyncInferenceClient(
-            model_or_url, token=token.resolve_value() if token else None, **resolved_api_params
-        )
-        self.tools = tools
-        self._is_warmed_up = False
-
-    def warm_up(self) -> None:
-        """
-        Warm up the Hugging Face API chat generator.
-
-        This will warm up the tools registered in the chat generator.
-        This method is idempotent and will only warm up the tools once.
-        """
-        if not self._is_warmed_up:
-            warm_up_tools(self.tools)
-            self._is_warmed_up = True
-
-    def to_dict(self) -> dict[str, Any]:
-        """
-        Serialize this component to a dictionary.
-
-        :returns:
-            A dictionary containing the serialized component.
-        """
-        callback_name = serialize_callable(self.streaming_callback) if self.streaming_callback else None
-        return default_to_dict(
-            self,
-            api_type=str(self.api_type),
-            api_params=self.api_params,
-            token=self.token,
-            generation_kwargs=self.generation_kwargs,
-            streaming_callback=callback_name,
-            tools=serialize_tools_or_toolset(self.tools),
-        )
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceAPIChatGenerator":
-        """
-        Deserialize this component from a dictionary.
-        """
-        deserialize_tools_or_toolset_inplace(data["init_parameters"], key="tools")
-        init_params = data.get("init_parameters", {})
-        serialized_callback_handler = init_params.get("streaming_callback")
-        if serialized_callback_handler:
-            data["init_parameters"]["streaming_callback"] = deserialize_callable(serialized_callback_handler)
-        return default_from_dict(cls, data)
-
-    @component.output_types(replies=list[ChatMessage])
-    def run(
-        self,
-        messages: list[ChatMessage] | str,
-        generation_kwargs: dict[str, Any] | None = None,
-        tools: ToolsType | None = None,
-        streaming_callback: StreamingCallbackT | None = None,
-    ) -> dict[str, list[ChatMessage]]:
-        """
-        Invoke the text generation inference based on the provided messages and generation parameters.
-
-        :param messages:
-            A list of ChatMessage objects representing the input messages. If a string is provided, it is converted
-            to a list containing a ChatMessage with user role.
-        :param generation_kwargs:
-            Additional keyword arguments for text generation.
-        :param tools:
-            A list of tools or a Toolset for which the model can prepare calls. If set, it will override
-            the `tools` parameter set during component initialization. This parameter can accept either a
-            list of `Tool` objects or a `Toolset` instance.
-        :param streaming_callback:
-            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
-            parameter set during component initialization.
-        :returns: A dictionary with the following keys:
-            - `replies`: A list containing the generated responses as ChatMessage objects.
-        """
-        if not self._is_warmed_up:
-            self.warm_up()
-
-        messages = _normalize_messages(messages)
-
-        # update generation kwargs by merging with the default ones
-        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
-
-        formatted_messages = [convert_message_to_hf_format(message) for message in messages]
-
-        tools = tools or self.tools
-        if tools and self.streaming_callback:
-            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
-        flat_tools = flatten_tools_or_toolsets(tools)
-        _check_duplicate_tool_names(flat_tools)
-
-        # validate and select the streaming callback
-        streaming_callback = select_streaming_callback(
-            self.streaming_callback, streaming_callback, requires_async=False
-        )
-
-        if streaming_callback:
-            return self._run_streaming(formatted_messages, generation_kwargs, streaming_callback)
-
-        hf_tools = _convert_tools_to_hfapi_tools(tools)
-
-        return self._run_non_streaming(formatted_messages, generation_kwargs, hf_tools)
-
-    @component.output_types(replies=list[ChatMessage])
-    async def run_async(
-        self,
-        messages: list[ChatMessage] | str,
-        generation_kwargs: dict[str, Any] | None = None,
-        tools: ToolsType | None = None,
-        streaming_callback: StreamingCallbackT | None = None,
-    ) -> dict[str, list[ChatMessage]]:
-        """
-        Asynchronously invokes the text generation inference based on the provided messages and generation parameters.
-
-        This is the asynchronous version of the `run` method. It has the same parameters
-        and return values but can be used with `await` in an async code.
-
-        :param messages:
-            A list of ChatMessage objects representing the input messages. If a string is provided, it is converted
-            to a list containing a ChatMessage with user role.
-        :param generation_kwargs:
-            Additional keyword arguments for text generation.
-        :param tools:
-            A list of tools or a Toolset for which the model can prepare calls. If set, it will override the `tools`
-            parameter set during component initialization. This parameter can accept either a list of `Tool` objects
-            or a `Toolset` instance.
-        :param streaming_callback:
-            An optional callable for handling streaming responses. If set, it will override the `streaming_callback`
-            parameter set during component initialization.
-        :returns: A dictionary with the following keys:
-            - `replies`: A list containing the generated responses as ChatMessage objects.
-        """
-        if not self._is_warmed_up:
-            self.warm_up()
-
-        messages = _normalize_messages(messages)
-
-        # update generation kwargs by merging with the default ones
-        generation_kwargs = {**self.generation_kwargs, **(generation_kwargs or {})}
-
-        formatted_messages = [convert_message_to_hf_format(message) for message in messages]
-
-        tools = tools or self.tools
-        if tools and self.streaming_callback:
-            raise ValueError("Using tools and streaming at the same time is not supported. Please choose one.")
-        flat_tools = flatten_tools_or_toolsets(tools)
-        _check_duplicate_tool_names(flat_tools)
-
-        # validate and select the streaming callback
-        streaming_callback = select_streaming_callback(self.streaming_callback, streaming_callback, requires_async=True)
-
-        if streaming_callback:
-            return await self._run_streaming_async(formatted_messages, generation_kwargs, streaming_callback)
-
-        hf_tools = _convert_tools_to_hfapi_tools(tools)
-
-        return await self._run_non_streaming_async(formatted_messages, generation_kwargs, hf_tools)
-
-    def _run_streaming(
-        self,
-        messages: list[dict[str, str]],
-        generation_kwargs: dict[str, Any],
-        streaming_callback: SyncStreamingCallbackT,
-    ) -> dict[str, list[ChatMessage]]:
-        api_output: Iterable[ChatCompletionStreamOutput] = self._client.chat_completion(
-            messages,
-            stream=True,
-            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
-            **generation_kwargs,
-        )
-
-        component_info = ComponentInfo.from_component(self)
-        streaming_chunks: list[StreamingChunk] = []
-        for chunk in api_output:
-            streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
-                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
-            )
-            streaming_chunks.append(streaming_chunk)
-            streaming_callback(streaming_chunk)
-
-        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
-        if message.meta.get("usage") is None:
-            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}
-
-        return {"replies": [message]}
-
-    def _run_non_streaming(
-        self,
-        messages: list[dict[str, str]],
-        generation_kwargs: dict[str, Any],
-        tools: list["ChatCompletionInputTool"] | None = None,
-    ) -> dict[str, list[ChatMessage]]:
-        api_chat_output: ChatCompletionOutput = self._client.chat_completion(
-            messages=messages, tools=tools, **generation_kwargs
-        )
-
-        if api_chat_output.choices is None or len(api_chat_output.choices) == 0:
-            return {"replies": []}
-
-        # n is unused, so the API always returns only one choice
-        # the argument is probably allowed for compatibility with OpenAI
-        # see https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.n
-        choice = api_chat_output.choices[0]
-
-        text = choice.message.content
-
-        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)
-
-        # Extract reasoning content if present
-        reasoning = _extract_reasoning_content(choice.message)
-
-        mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
-        meta: dict[str, Any] = {
-            "model": self._client.model,
-            "finish_reason": mapped_finish_reason,
-            "index": choice.index,
-        }
-
-        usage = {"prompt_tokens": 0, "completion_tokens": 0}
-        if api_chat_output.usage:
-            usage = {
-                "prompt_tokens": api_chat_output.usage.prompt_tokens,
-                "completion_tokens": api_chat_output.usage.completion_tokens,
-            }
-        meta["usage"] = usage
-
-        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, reasoning=reasoning, meta=meta)
-        return {"replies": [message]}
-
-    async def _run_streaming_async(
-        self, messages: list[dict[str, str]], generation_kwargs: dict[str, Any], streaming_callback: StreamingCallbackT
-    ) -> dict[str, list[ChatMessage]]:
-        api_output: AsyncIterable[ChatCompletionStreamOutput] = await self._async_client.chat_completion(
-            messages,
-            stream=True,
-            stream_options=ChatCompletionInputStreamOptions(include_usage=True),
-            **generation_kwargs,
-        )
-
-        component_info = ComponentInfo.from_component(self)
-        streaming_chunks: list[StreamingChunk] = []
-        async for chunk in api_output:
-            stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
-                chunk=chunk, previous_chunks=streaming_chunks, component_info=component_info
-            )
-            streaming_chunks.append(stream_chunk)
-            await _invoke_streaming_callback(streaming_callback, stream_chunk)
-
-        message = _convert_streaming_chunks_to_chat_message(chunks=streaming_chunks)
-        if message.meta.get("usage") is None:
-            message.meta["usage"] = {"prompt_tokens": 0, "completion_tokens": 0}
-
-        return {"replies": [message]}
-
-    async def _run_non_streaming_async(
-        self,
-        messages: list[dict[str, str]],
-        generation_kwargs: dict[str, Any],
-        tools: list["ChatCompletionInputTool"] | None = None,
-    ) -> dict[str, list[ChatMessage]]:
-        api_chat_output: ChatCompletionOutput = await self._async_client.chat_completion(
-            messages=messages, tools=tools, **generation_kwargs
-        )
-
-        if api_chat_output.choices is None or len(api_chat_output.choices) == 0:
-            return {"replies": []}
-
-        choice = api_chat_output.choices[0]
-
-        text = choice.message.content
-
-        tool_calls = _convert_hfapi_tool_calls(choice.message.tool_calls)
-
-        # Extract reasoning content if present
-        reasoning = _extract_reasoning_content(choice.message)
-
-        mapped_finish_reason = _map_hf_finish_reason_to_haystack(choice) if choice.finish_reason else None
-        meta: dict[str, Any] = {
-            "model": self._async_client.model,
-            "finish_reason": mapped_finish_reason,
-            "index": choice.index,
-        }
-
-        usage = {"prompt_tokens": 0, "completion_tokens": 0}
-        if api_chat_output.usage:
-            usage = {
-                "prompt_tokens": api_chat_output.usage.prompt_tokens,
-                "completion_tokens": api_chat_output.usage.completion_tokens,
-            }
-        meta["usage"] = usage
-
-        message = ChatMessage.from_assistant(text=text, tool_calls=tool_calls, reasoning=reasoning, meta=meta)
-        return {"replies": [message]}
diff --git a/haystack/components/rankers/__init__.py b/haystack/components/rankers/__init__.py
index 7960d70384..7134043f8d 100644
--- a/haystack/components/rankers/__init__.py
+++ b/haystack/components/rankers/__init__.py
@@ -8,7 +8,6 @@
 from lazy_imports import LazyImporter
 
 _import_structure = {
-    "hugging_face_tei": ["HuggingFaceTEIRanker"],
     "llm_ranker": ["LLMRanker"],
     "lost_in_the_middle": ["LostInTheMiddleRanker"],
     "meta_field": ["MetaFieldRanker"],
@@ -19,7 +18,6 @@
 }
 
 if TYPE_CHECKING:
-    from .hugging_face_tei import HuggingFaceTEIRanker as HuggingFaceTEIRanker
     from .llm_ranker import LLMRanker as LLMRanker
     from .lost_in_the_middle import LostInTheMiddleRanker as LostInTheMiddleRanker
     from .meta_field import MetaFieldRanker as MetaFieldRanker
diff --git a/haystack/components/rankers/hugging_face_tei.py b/haystack/components/rankers/hugging_face_tei.py
deleted file mode 100644
index 5e5957854f..0000000000
--- a/haystack/components/rankers/hugging_face_tei.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from dataclasses import replace
-from enum import Enum
-from typing import Any
-from urllib.parse import urljoin
-
-import httpx
-
-from haystack import Document, component, default_from_dict, default_to_dict
-from haystack.utils import Secret
-from haystack.utils.misc import _deduplicate_documents
-from haystack.utils.requests_utils import async_request_with_retry, request_with_retry
-
-
-class TruncationDirection(str, Enum):
-    """
-    Defines the direction to truncate text when input length exceeds the model's limit.
-
-    Attributes:
-        LEFT: Truncate text from the left side (start of text).
-        RIGHT: Truncate text from the right side (end of text).
-    """
-
-    LEFT = "Left"
-    RIGHT = "Right"
-
-
-@component
-class HuggingFaceTEIRanker:
-    """
-    Ranks documents based on their semantic similarity to the query.
-
-    It can be used with a Text Embeddings Inference (TEI) API endpoint:
-    - [Self-hosted Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference)
-    - [Hugging Face Inference Endpoints](https://huggingface.co/inference-endpoints)
-
-    Usage example:
-    <!-- test-ignore -->
-    ```python
-    from haystack import Document
-    from haystack.components.rankers import HuggingFaceTEIRanker
-    from haystack.utils import Secret
-
-    reranker = HuggingFaceTEIRanker(
-        url="http://localhost:8080",
-        top_k=5,
-        timeout=30,
-        token=Secret.from_token("my_api_token")
-    )
-
-    docs = [Document(content="The capital of France is Paris"), Document(content="The capital of Germany is Berlin")]
-
-    result = reranker.run(query="What is the capital of France?", documents=docs)
-
-    ranked_docs = result["documents"]
-    print(ranked_docs)
-    # >> {'documents': [Document(id=..., content: 'the capital of France is Paris', score: 0.9979767),
-    # >>                Document(id=..., content: 'the capital of Germany is Berlin', score: 0.13982213)]}
-    ```
-    """
-
-    def __init__(
-        self,
-        *,
-        url: str,
-        top_k: int = 10,
-        raw_scores: bool = False,
-        timeout: int | None = 30,
-        max_retries: int = 3,
-        retry_status_codes: list[int] | None = None,
-        token: Secret | None = Secret.from_env_var(["HF_API_TOKEN", "HF_TOKEN"], strict=False),
-    ) -> None:
-        """
-        Initializes the TEI reranker component.
-
-        :param url: Base URL of the TEI reranking service (for example, "https://api.example.com").
-        :param top_k: Maximum number of top documents to return.
-        :param raw_scores: If True, include raw relevance scores in the API payload.
-        :param timeout: Request timeout in seconds.
-        :param max_retries: Maximum number of retry attempts for failed requests.
-        :param retry_status_codes: List of HTTP status codes that will trigger a retry.
-            When None, HTTP 408, 418, 429 and 503 will be retried (default: None).
-        :param token: The Hugging Face token to use as HTTP bearer authorization. Not always required
-            depending on your TEI server configuration.
-            Check your HF token in your [account settings](https://huggingface.co/settings/tokens).
-        """
-        self.url = url
-        self.top_k = top_k
-        self.timeout = timeout
-        self.token = token
-        self.max_retries = max_retries
-        self.retry_status_codes = retry_status_codes
-        self.raw_scores = raw_scores
-
-    def to_dict(self) -> dict[str, Any]:
-        """
-        Serializes the component to a dictionary.
-
-        :returns:
-            Dictionary with serialized data.
-        """
-        return default_to_dict(
-            self,
-            url=self.url,
-            top_k=self.top_k,
-            timeout=self.timeout,
-            token=self.token,
-            max_retries=self.max_retries,
-            retry_status_codes=self.retry_status_codes,
-        )
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "HuggingFaceTEIRanker":
-        """
-        Deserializes the component from a dictionary.
-
-        :param data:
-            Dictionary to deserialize from.
-        :returns:
-            Deserialized component.
-        """
-        return default_from_dict(cls, data)
-
-    def _compose_response(
-        self, result: dict[str, str] | list[dict[str, Any]], top_k: int | None, documents: list[Document]
-    ) -> dict[str, list[Document]]:
-        """
-        Processes the API response into a structured format.
-
-        :param result: The raw response from the API.
-
-        :returns: A dictionary with the following keys:
-            - `documents`: A list of reranked documents.
-
-        :raises RuntimeError:
-            - If the API request fails.
-
-        :raises RuntimeError:
-            - If the API returns an error response.
-
-        :raises TypeError:
-            - If the API response is not in the expected list format.
-        """
-        if isinstance(result, dict) and "error" in result:
-            error_type = result.get("error_type", "UnknownError")
-            error_msg = result.get("error", "No additional information.")
-            raise RuntimeError(f"HuggingFaceTEIRanker API call failed ({error_type}): {error_msg}")
-
-        # Ensure we have a list of score dicts
-        if not isinstance(result, list):
-            # Expected list or dict, but encountered an unknown response format.
-            error_msg = f"Expected a list of score dictionaries, but got `{type(result).__name__}`. "
-            error_msg += f"Response content: {result}"
-            raise TypeError(f"Unexpected response format from text-embeddings-inference rerank API: {error_msg}")
-
-        # Determine number of docs to return
-        final_k = min(top_k or self.top_k, len(result))
-
-        # Select and return the top_k documents
-        ranked_docs = []
-        for item in result[:final_k]:
-            index: int = item["index"]
-            ranked_docs.append(replace(documents[index], score=item["score"]))
-        return {"documents": ranked_docs}
-
-    @component.output_types(documents=list[Document])
-    def run(
-        self,
-        query: str,
-        documents: list[Document],
-        top_k: int | None = None,
-        truncation_direction: TruncationDirection | None = None,
-    ) -> dict[str, list[Document]]:
-        """
-        Reranks the provided documents by relevance to the query using the TEI API.
-
-        Before ranking, documents are deduplicated by their id, retaining only the document with the highest score
-        if a score is present.
-
-        :param query: The user query string to guide reranking.
-        :param documents: List of `Document` objects to rerank.
-        :param top_k: Optional override for the maximum number of documents to return.
-        :param truncation_direction: If set, enables text truncation in the specified direction.
-
-        :returns: A dictionary with the following keys:
-            - `documents`: A list of reranked documents.
-
-        :raises RuntimeError:
-            - If the API request fails.
-
-        :raises RuntimeError:
-            - If the API returns an error response.
-
-        :raises TypeError:
-            - If the API response is not in the expected list format.
-        """
-        # Return empty if no documents provided
-        if not documents:
-            return {"documents": []}
-
-        # Prepare the payload
-        deduplicated_documents = _deduplicate_documents(documents)
-        texts = [doc.content for doc in deduplicated_documents]
-        payload: dict[str, Any] = {"query": query, "texts": texts, "raw_scores": self.raw_scores}
-        if truncation_direction:
-            payload.update({"truncate": True, "truncation_direction": truncation_direction.value})
-
-        headers = {}
-        if self.token and self.token.resolve_value():
-            headers["Authorization"] = f"Bearer {self.token.resolve_value()}"
-
-        # Call the external service with retry
-        try:
-            response = request_with_retry(
-                method="POST",
-                url=urljoin(self.url, "/rerank"),
-                json=payload,
-                timeout=self.timeout,
-                headers=headers,
-                attempts=self.max_retries,
-                status_codes_to_retry=self.retry_status_codes,
-            )
-        except httpx.HTTPStatusError as e:
-            raise RuntimeError(f"HuggingFaceTEIRanker API call failed. Error: {e}, Response: {e.response.text}") from e
-
-        result: dict[str, str] | list[dict[str, Any]] = response.json()
-
-        return self._compose_response(result, top_k, deduplicated_documents)
-
-    @component.output_types(documents=list[Document])
-    async def run_async(
-        self,
-        query: str,
-        documents: list[Document],
-        top_k: int | None = None,
-        truncation_direction: TruncationDirection | None = None,
-    ) -> dict[str, list[Document]]:
-        """
-        Asynchronously reranks the provided documents by relevance to the query using the TEI API.
-
-        Before ranking, documents are deduplicated by their id, retaining only the document with the highest score
-        if a score is present.
-
-        :param query: The user query string to guide reranking.
-        :param documents: List of `Document` objects to rerank.
-        :param top_k: Optional override for the maximum number of documents to return.
-        :param truncation_direction: If set, enables text truncation in the specified direction.
-
-        :returns: A dictionary with the following keys:
-            - `documents`: A list of reranked documents.
-
-        :raises httpx.RequestError:
-            - If the API request fails.
-        :raises RuntimeError:
-            - If the API returns an error response.
-        :raises TypeError:
-            - If the API response is not in the expected list format.
-        """
-        # Return empty if no documents provided
-        if not documents:
-            return {"documents": []}
-
-        # Prepare the payload
-        deduplicated_documents = _deduplicate_documents(documents)
-        texts = [doc.content for doc in deduplicated_documents]
-        payload: dict[str, Any] = {"query": query, "texts": texts, "raw_scores": self.raw_scores}
-        if truncation_direction:
-            payload.update({"truncate": True, "truncation_direction": truncation_direction.value})
-
-        headers = {}
-        if self.token and self.token.resolve_value():
-            headers["Authorization"] = f"Bearer {self.token.resolve_value()}"
-
-        # Call the external service with retry
-        try:
-            response = await async_request_with_retry(
-                method="POST",
-                url=urljoin(self.url, "/rerank"),
-                json=payload,
-                timeout=self.timeout,
-                headers=headers,
-                attempts=self.max_retries,
-                status_codes_to_retry=self.retry_status_codes,
-            )
-        except httpx.HTTPStatusError as e:
-            raise RuntimeError(f"HuggingFaceTEIRanker API call failed. Error: {e}, Response: {e.response.text}") from e
-
-        result: dict[str, str] | list[dict[str, Any]] = response.json()
-
-        return self._compose_response(result, top_k, deduplicated_documents)
diff --git a/haystack/utils/hf.py b/haystack/utils/hf.py
index c4bcf2a8e9..faa2606a36 100644
--- a/haystack/utils/hf.py
+++ b/haystack/utils/hf.py
@@ -4,7 +4,6 @@
 
 import asyncio
 import copy
-from enum import Enum
 from typing import Any
 
 from haystack import logging
@@ -27,84 +26,11 @@
     import torch
 
 with LazyImport(message="Run 'pip install \"huggingface_hub>=0.27.0\"'") as huggingface_hub_import:
-    from huggingface_hub import HfApi, model_info
-    from huggingface_hub.utils import RepositoryNotFoundError
+    from huggingface_hub import model_info
 
 logger = logging.getLogger(__name__)
 
 
-class HFGenerationAPIType(Enum):
-    """
-    API type to use for Hugging Face API Generators.
-    """
-
-    # HF [Text Generation Inference (TGI)](https://github.com/huggingface/text-generation-inference).
-    TEXT_GENERATION_INFERENCE = "text_generation_inference"
-
-    # HF [Inference Endpoints](https://huggingface.co/inference-endpoints).
-    INFERENCE_ENDPOINTS = "inference_endpoints"
-
-    # HF [Serverless Inference API](https://huggingface.co/inference-api).
-    SERVERLESS_INFERENCE_API = "serverless_inference_api"
-
-    def __str__(self) -> str:
-        return self.value
-
-    @staticmethod
-    def from_str(string: str) -> "HFGenerationAPIType":
-        """
-        Convert a string to a HFGenerationAPIType enum.
-
-        :param string: The string to convert.
-        :return: The corresponding HFGenerationAPIType enum.
-
-        """
-        enum_map = {e.value: e for e in HFGenerationAPIType}
-        mode = enum_map.get(string)
-        if mode is None:
-            msg = f"Unknown Hugging Face API type '{string}'. Supported types are: {list(enum_map.keys())}"
-            raise ValueError(msg)
-        return mode
-
-
-class HFEmbeddingAPIType(Enum):
-    """
-    API type to use for Hugging Face API Embedders.
-    """
-
-    # HF [Text Embeddings Inference (TEI)](https://github.com/huggingface/text-embeddings-inference).
-    TEXT_EMBEDDINGS_INFERENCE = "text_embeddings_inference"
-
-    # HF [Inference Endpoints](https://huggingface.co/inference-endpoints).
-    INFERENCE_ENDPOINTS = "inference_endpoints"
-
-    # HF [Serverless Inference API](https://huggingface.co/inference-api).
-    SERVERLESS_INFERENCE_API = "serverless_inference_api"
-
-    def __str__(self) -> str:
-        return self.value
-
-    @staticmethod
-    def from_str(string: str) -> "HFEmbeddingAPIType":
-        """
-        Convert a string to a HFEmbeddingAPIType enum.
-
-        :param string:
-        :return: The corresponding HFEmbeddingAPIType enum.
-        """
-        enum_map = {e.value: e for e in HFEmbeddingAPIType}
-        mode = enum_map.get(string)
-        if mode is None:
-            msg = f"Unknown Hugging Face API type '{string}'. Supported types are: {list(enum_map.keys())}"
-            raise ValueError(msg)
-        return mode
-
-
-class HFModelType(Enum):
-    EMBEDDING = 1
-    GENERATION = 2
-
-
 def serialize_hf_model_kwargs(kwargs: dict[str, Any]) -> None:
     """
     Recursively serialize HuggingFace specific model keyword arguments in-place to make them JSON serializable.
@@ -220,41 +146,6 @@ def resolve_hf_pipeline_kwargs(
     return huggingface_pipeline_kwargs
 
 
-def check_valid_model(model_id: str, model_type: HFModelType, token: Secret | None) -> None:
-    """
-    Check if the provided model ID corresponds to a valid model on HuggingFace Hub.
-
-    Also check if the model is an embedding or generation model.
-
-    :param model_id: A string representing the HuggingFace model ID.
-    :param model_type: the model type, HFModelType.EMBEDDING or HFModelType.GENERATION
-    :param token: The optional authentication token.
-    :raises ValueError: If the model is not found or is not a embedding model.
-    """
-    huggingface_hub_import.check()
-
-    api = HfApi()
-    try:
-        model_info = api.model_info(model_id, token=token.resolve_value() if token else None)
-    except RepositoryNotFoundError as e:
-        raise ValueError(
-            f"Model {model_id} not found on HuggingFace Hub. Please provide a valid HuggingFace model_id."
-        ) from e
-
-    if model_type == HFModelType.EMBEDDING:
-        allowed_model = model_info.pipeline_tag in ["sentence-similarity", "feature-extraction"]
-        error_msg = f"Model {model_id} is not a embedding model. Please provide a embedding model."
-    elif model_type == HFModelType.GENERATION:
-        allowed_model = model_info.pipeline_tag in ["text-generation", "text2text-generation", "image-text-to-text"]
-        error_msg = f"Model {model_id} is not a text generation model. Please provide a text generation model."
-    else:
-        allowed_model = False
-        error_msg = f"Unknown model type for {model_id}"
-
-    if not allowed_model:
-        raise ValueError(error_msg)
-
-
 def convert_message_to_hf_format(message: ChatMessage) -> dict[str, Any]:
     """
     Convert a message to the format expected by Hugging Face.
diff --git a/pydoc/embedders_api.yml b/pydoc/embedders_api.yml
index d97773efef..4533d36049 100644
--- a/pydoc/embedders_api.yml
+++ b/pydoc/embedders_api.yml
@@ -1,10 +1,17 @@
 loaders:
   - search_path: [../haystack/components/embedders]
-    modules: ["azure_document_embedder", "azure_text_embedder", "hugging_face_api_document_embedder",
-      "hugging_face_api_text_embedder", "openai_document_embedder", "openai_text_embedder",
-      "sentence_transformers_document_embedder", "sentence_transformers_text_embedder",
-      "sentence_transformers_sparse_document_embedder", "sentence_transformers_sparse_text_embedder",
-      "image/sentence_transformers_doc_image_embedder"]
+    modules:
+      [
+        "azure_document_embedder",
+        "azure_text_embedder",
+        "openai_document_embedder",
+        "openai_text_embedder",
+        "sentence_transformers_document_embedder",
+        "sentence_transformers_text_embedder",
+        "sentence_transformers_sparse_document_embedder",
+        "sentence_transformers_sparse_text_embedder",
+        "image/sentence_transformers_doc_image_embedder",
+      ]
 processors:
   - type: filter
     documented_only: true
diff --git a/pydoc/generators_api.yml b/pydoc/generators_api.yml
index 198d2d3120..20af90637f 100644
--- a/pydoc/generators_api.yml
+++ b/pydoc/generators_api.yml
@@ -5,7 +5,6 @@ loaders:
         "chat/azure",
         "chat/azure_responses",
         "chat/fallback",
-        "chat/hugging_face_api",
         "chat/hugging_face_local",
         "chat/llm",
         "chat/openai",
diff --git a/pydoc/rankers_api.yml b/pydoc/rankers_api.yml
index a70d4ec823..d4b5015f95 100644
--- a/pydoc/rankers_api.yml
+++ b/pydoc/rankers_api.yml
@@ -1,6 +1,6 @@
 loaders:
   - search_path: [../haystack/components/rankers]
-    modules: ["hugging_face_tei", "llm_ranker", "lost_in_the_middle", "meta_field", "meta_field_grouping_ranker",
+    modules: ["llm_ranker", "lost_in_the_middle", "meta_field", "meta_field_grouping_ranker",
       "sentence_transformers_diversity", "sentence_transformers_similarity", "transformers_similarity"]
 processors:
   - type: filter
diff --git a/pyproject.toml b/pyproject.toml
index 888e9a770c..579435a89f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -93,7 +93,6 @@ dependencies = [
                                                       # EvaluationRunResult, XLSXToDocument, and pipeline tests
 
   "transformers[torch, sentencepiece]>=4.57",         # ExtractiveReader, TransformersSimilarityRanker, LocalWhisperTranscriber, HFGenerators...
-  "huggingface_hub>=0.27.0",                          # Hugging Face API Generators and Embedders
   "sentence-transformers>=5.0.0",                     # Sentence Transformers Embedders, Rankers, and SASEvaluator
   "langdetect",                                       # TextLanguageRouter and DocumentLanguageClassifier
   "openai-whisper>=20231106",                         # LocalWhisperTranscriber
diff --git a/releasenotes/notes/remove-hf-api-components-97bb895a321bc3ab.yaml b/releasenotes/notes/remove-hf-api-components-97bb895a321bc3ab.yaml
new file mode 100644
index 0000000000..a0ee623c88
--- /dev/null
+++ b/releasenotes/notes/remove-hf-api-components-97bb895a321bc3ab.yaml
@@ -0,0 +1,25 @@
+---
+upgrade:
+  - |
+    ``HuggingFaceAPIChatGenerator``, ``HuggingFaceAPITextEmbedder``, ``HuggingFaceAPIDocumentEmbedder``, and
+    ``HuggingFaceTEIRanker`` have been moved out of Haystack into the ``huggingface-api-haystack`` integration
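+    package. Install the new package with ``pip install huggingface-api-haystack`` and update your imports. The related helpers ``HFGenerationAPIType``, ``HFEmbeddingAPIType``, ``HFModelType``, and ``check_valid_model`` have also been removed from ``haystack.utils.hf``.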
+
+    Before:
+
+    .. code:: python
+
+      from haystack.components.generators.chat import HuggingFaceAPIChatGenerator
+      from haystack.components.embedders import HuggingFaceAPITextEmbedder, HuggingFaceAPIDocumentEmbedder
+      from haystack.components.rankers import HuggingFaceTEIRanker
+
+    After:
+
+    .. code:: python
+
+      from haystack_integrations.components.generators.huggingface_api import HuggingFaceAPIChatGenerator
+      from haystack_integrations.components.embedders.huggingface_api import (
+          HuggingFaceAPITextEmbedder,
+          HuggingFaceAPIDocumentEmbedder,
+      )
+      from haystack_integrations.components.rankers.huggingface_api import HuggingFaceTEIRanker
diff --git a/test/components/embedders/test_hugging_face_api_document_embedder.py b/test/components/embedders/test_hugging_face_api_document_embedder.py
deleted file mode 100644
index 477123ae51..0000000000
--- a/test/components/embedders/test_hugging_face_api_document_embedder.py
+++ /dev/null
@@ -1,604 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import random
-import sys
-from unittest.mock import MagicMock, patch
-
-import pytest
-from huggingface_hub.utils import RepositoryNotFoundError
-from numpy import array
-
-from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder
-from haystack.dataclasses import Document
-from haystack.utils.auth import Secret
-from haystack.utils.hf import HFEmbeddingAPIType
-
-
-@pytest.fixture
-def mock_check_valid_model():
-    with patch(
-        "haystack.components.embedders.hugging_face_api_document_embedder.check_valid_model",
-        MagicMock(return_value=None),
-    ) as mock:
-        yield mock
-
-
-def mock_embedding_generation(text, **kwargs):
-    return array([[random.random() for _ in range(384)] for _ in range(len(text))])
-
-
-class TestHuggingFaceAPIDocumentEmbedder:
-    def test_init_invalid_api_type(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIDocumentEmbedder(api_type="invalid_api_type", api_params={})
-
-    def test_init_serverless(self, mock_check_valid_model):
-        model = "BAAI/bge-small-en-v1.5"
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model}
-        )
-
-        assert embedder.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API
-        assert embedder.api_params == {"model": model}
-        assert embedder.prefix == ""
-        assert embedder.suffix == ""
-        assert embedder.truncate
-        assert not embedder.normalize
-        assert embedder.batch_size == 32
-        assert embedder.progress_bar
-        assert embedder.meta_fields_to_embed == []
-        assert embedder.embedding_separator == "\n"
-
-    def test_init_serverless_invalid_model(self, mock_check_valid_model):
-        mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id", response=MagicMock())
-        with pytest.raises(RepositoryNotFoundError):
-            HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "invalid_model_id"}
-            )
-
-    def test_init_serverless_no_model(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"param": "irrelevant"}
-            )
-
-    def test_init_tei(self):
-        url = "https://some_model.com"
-
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": url}
-        )
-
-        assert embedder.api_type == HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE
-        assert embedder.api_params == {"url": url}
-        assert embedder.prefix == ""
-        assert embedder.suffix == ""
-        assert embedder.truncate
-        assert not embedder.normalize
-        assert embedder.batch_size == 32
-        assert embedder.progress_bar
-        assert embedder.meta_fields_to_embed == []
-        assert embedder.embedding_separator == "\n"
-
-    def test_init_tei_invalid_url(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": "invalid_url"}
-            )
-
-    def test_init_tei_no_url(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"param": "irrelevant"}
-            )
-
-    def test_to_dict(self, mock_check_valid_model):
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "BAAI/bge-small-en-v1.5"},
-            prefix="prefix",
-            suffix="suffix",
-            truncate=False,
-            normalize=True,
-            batch_size=128,
-            progress_bar=False,
-            meta_fields_to_embed=["meta_field"],
-            embedding_separator=" ",
-            concurrency_limit=7,
-        )
-
-        data = embedder.to_dict()
-
-        assert data == {
-            "type": "haystack.components.embedders.hugging_face_api_document_embedder.HuggingFaceAPIDocumentEmbedder",
-            "init_parameters": {
-                "api_type": "serverless_inference_api",
-                "api_params": {"model": "BAAI/bge-small-en-v1.5"},
-                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
-                "prefix": "prefix",
-                "suffix": "suffix",
-                "truncate": False,
-                "normalize": True,
-                "batch_size": 128,
-                "progress_bar": False,
-                "meta_fields_to_embed": ["meta_field"],
-                "embedding_separator": " ",
-                "concurrency_limit": 7,
-            },
-        }
-
-    def test_from_dict(self, mock_check_valid_model):
-        data = {
-            "type": "haystack.components.embedders.hugging_face_api_document_embedder.HuggingFaceAPIDocumentEmbedder",
-            "init_parameters": {
-                "api_type": HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                "api_params": {"model": "BAAI/bge-small-en-v1.5"},
-                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
-                "prefix": "prefix",
-                "suffix": "suffix",
-                "truncate": False,
-                "normalize": True,
-                "batch_size": 128,
-                "progress_bar": False,
-                "meta_fields_to_embed": ["meta_field"],
-                "embedding_separator": " ",
-                "concurrency_limit": 7,
-            },
-        }
-
-        embedder = HuggingFaceAPIDocumentEmbedder.from_dict(data)
-
-        assert embedder.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API
-        assert embedder.api_params == {"model": "BAAI/bge-small-en-v1.5"}
-        assert embedder.prefix == "prefix"
-        assert embedder.suffix == "suffix"
-        assert not embedder.truncate
-        assert embedder.normalize
-        assert embedder.batch_size == 128
-        assert not embedder.progress_bar
-        assert embedder.meta_fields_to_embed == ["meta_field"]
-        assert embedder.embedding_separator == " "
-        assert embedder.concurrency_limit == 7
-
-    def test_prepare_texts_to_embed_w_metadata(self):
-        documents = [
-            Document(content=f"document number {i}: content", meta={"meta_field": f"meta_value {i}"}) for i in range(5)
-        ]
-
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE,
-            api_params={"url": "https://some_model.com"},
-            token=Secret.from_token("fake-api-token"),
-            meta_fields_to_embed=["meta_field"],
-            embedding_separator=" | ",
-        )
-
-        prepared_texts = embedder._prepare_texts_to_embed(documents)
-
-        assert prepared_texts == [
-            "meta_value 0 | document number 0: content",
-            "meta_value 1 | document number 1: content",
-            "meta_value 2 | document number 2: content",
-            "meta_value 3 | document number 3: content",
-            "meta_value 4 | document number 4: content",
-        ]
-
-    def test_prepare_texts_to_embed_w_suffix(self, mock_check_valid_model):
-        documents = [Document(content=f"document number {i}") for i in range(5)]
-
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE,
-            api_params={"url": "https://some_model.com"},
-            token=Secret.from_token("fake-api-token"),
-            prefix="my_prefix ",
-            suffix=" my_suffix",
-        )
-
-        prepared_texts = embedder._prepare_texts_to_embed(documents)
-
-        assert prepared_texts == [
-            "my_prefix document number 0 my_suffix",
-            "my_prefix document number 1 my_suffix",
-            "my_prefix document number 2 my_suffix",
-            "my_prefix document number 3 my_suffix",
-            "my_prefix document number 4 my_suffix",
-        ]
-
-    def test_embed_batch(self, mock_check_valid_model, caplog):
-        texts = ["text 1", "text 2", "text 3", "text 4", "text 5"]
-
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.side_effect = mock_embedding_generation
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-            )
-            embeddings = embedder._embed_batch(texts_to_embed=texts, batch_size=2)
-
-            assert mock_embedding_patch.call_count == 3
-
-        assert isinstance(embeddings, list)
-        assert len(embeddings) == len(texts)
-        for embedding in embeddings:
-            assert isinstance(embedding, list)
-            assert len(embedding) == 384
-            assert all(isinstance(x, float) for x in embedding)
-
-        # Check that logger warnings about ignoring truncate and normalize are raised
-        assert len(caplog.records) == 2
-        assert "truncate" in caplog.records[0].message
-        assert "normalize" in caplog.records[1].message
-
-    def test_embed_batch_wrong_embedding_shape(self, mock_check_valid_model):
-        texts = ["text 1", "text 2", "text 3", "text 4", "text 5"]
-
-        # embedding ndim != 2
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([0.1, 0.2, 0.3])
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-            )
-
-            with pytest.raises(ValueError):
-                embedder._embed_batch(texts_to_embed=texts, batch_size=2)
-
-        # embedding ndim == 2 but shape[0] != len(batch)
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-            )
-
-            with pytest.raises(ValueError):
-                embedder._embed_batch(texts_to_embed=texts, batch_size=2)
-
-    def test_run_wrong_input_format(self, mock_check_valid_model):
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"}
-        )
-
-        list_integers_input = [1, 2, 3]
-
-        with pytest.raises(TypeError):
-            embedder.run(text=list_integers_input)
-
-    def test_run_on_empty_list(self, mock_check_valid_model):
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "BAAI/bge-small-en-v1.5"},
-            token=Secret.from_token("fake-api-token"),
-        )
-
-        empty_list_input = []
-        result = embedder.run(documents=empty_list_input)
-
-        assert result["documents"] is not None
-        assert not result["documents"]  # empty list
-
-    def test_run(self, mock_check_valid_model):
-        docs = [
-            Document(content="I love cheese", meta={"topic": "Cuisine"}),
-            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
-        ]
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.side_effect = mock_embedding_generation
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                prefix="prefix ",
-                suffix=" suffix",
-                meta_fields_to_embed=["topic"],
-                embedding_separator=" | ",
-            )
-
-            result = embedder.run(documents=docs)
-
-            mock_embedding_patch.assert_called_once_with(
-                text=[
-                    "prefix Cuisine | I love cheese suffix",
-                    "prefix ML | A transformer is a deep learning architecture suffix",
-                ],
-                truncate=None,
-                normalize=None,
-            )
-
-        documents_with_embeddings = result["documents"]
-
-        assert isinstance(documents_with_embeddings, list)
-        assert len(documents_with_embeddings) == len(docs)
-        for doc, new_doc in zip(docs, documents_with_embeddings, strict=True):
-            assert doc.embedding is None
-            assert new_doc is not doc
-            assert isinstance(new_doc, Document)
-            assert isinstance(new_doc.embedding, list)
-            assert len(new_doc.embedding) == 384
-            assert all(isinstance(x, float) for x in new_doc.embedding)
-
-    def test_run_custom_batch_size(self, mock_check_valid_model):
-        docs = [
-            Document(content="I love cheese", meta={"topic": "Cuisine"}),
-            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
-        ]
-
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.side_effect = mock_embedding_generation
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                prefix="prefix ",
-                suffix=" suffix",
-                meta_fields_to_embed=["topic"],
-                embedding_separator=" | ",
-                batch_size=1,
-            )
-
-            result = embedder.run(documents=docs)
-
-            assert mock_embedding_patch.call_count == 2
-
-        documents_with_embeddings = result["documents"]
-
-        assert isinstance(documents_with_embeddings, list)
-        assert len(documents_with_embeddings) == len(docs)
-        for doc in documents_with_embeddings:
-            assert isinstance(doc, Document)
-            assert isinstance(doc.embedding, list)
-            assert len(doc.embedding) == 384
-            assert all(isinstance(x, float) for x in doc.embedding)
-
-    def test_adjust_api_parameters(self):
-        truncate, normalize = HuggingFaceAPIDocumentEmbedder._adjust_api_parameters(
-            True, False, HFEmbeddingAPIType.SERVERLESS_INFERENCE_API
-        )
-        assert truncate is None
-        assert normalize is None
-
-        truncate, normalize = HuggingFaceAPIDocumentEmbedder._adjust_api_parameters(
-            True, False, HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE
-        )
-        assert truncate is True
-        assert normalize is False
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    @pytest.mark.skipif(sys.platform != "linux", reason="We only test on Linux to avoid overloading the HF server")
-    @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s")
-    def test_live_run_serverless(self):
-        docs = [
-            Document(content="I love cheese", meta={"topic": "Cuisine"}),
-            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
-        ]
-
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "sentence-transformers/all-MiniLM-L6-v2"},
-            meta_fields_to_embed=["topic"],
-            embedding_separator=" | ",
-        )
-        embedder._client.timeout = 10  # we want to fail fast if the server is not responding
-        result = embedder.run(documents=docs)
-        documents_with_embeddings = result["documents"]
-
-        assert isinstance(documents_with_embeddings, list)
-        assert len(documents_with_embeddings) == len(docs)
-        for doc in documents_with_embeddings:
-            assert isinstance(doc, Document)
-            assert isinstance(doc.embedding, list)
-            assert len(doc.embedding) == 384
-            assert all(isinstance(x, float) for x in doc.embedding)
-
-    @pytest.mark.asyncio
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    @pytest.mark.skipif(sys.platform != "linux", reason="We only test on Linux to avoid overloading the HF server")
-    @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s")
-    async def test_live_run_serverless_async(self) -> None:
-        docs = [
-            Document(content="I love cheese", meta={"topic": "Cuisine"}),
-            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
-        ]
-
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "sentence-transformers/all-MiniLM-L6-v2"},
-            meta_fields_to_embed=["topic"],
-            embedding_separator=" | ",
-        )
-        embedder._async_client.timeout = 10  # we want to fail fast if the server is not responding
-        result = await embedder.run_async(documents=docs)
-        documents_with_embeddings = result["documents"]
-
-        assert isinstance(documents_with_embeddings, list)
-        assert len(documents_with_embeddings) == len(docs)
-        for doc in documents_with_embeddings:
-            assert isinstance(doc, Document)
-            assert isinstance(doc.embedding, list)
-            assert len(doc.embedding) == 384
-            assert all(isinstance(x, float) for x in doc.embedding)
-
-    @pytest.mark.asyncio
-    async def test_embed_batch_async(self, mock_check_valid_model, caplog):
-        texts = ["text 1", "text 2", "text 3", "text 4", "text 5"]
-
-        with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.side_effect = mock_embedding_generation
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                concurrency_limit=4,
-            )
-            embeddings = await embedder._embed_batch_async(texts_to_embed=texts, batch_size=2)
-
-            assert mock_embedding_patch.call_count == 3
-
-        assert isinstance(embeddings, list)
-        assert len(embeddings) == len(texts)
-        for embedding in embeddings:
-            assert isinstance(embedding, list)
-            assert len(embedding) == 384
-            assert all(isinstance(x, float) for x in embedding)
-
-        # Check that logger warnings about ignoring truncate and normalize are raised
-        assert len(caplog.records) == 2
-        assert "truncate" in caplog.records[0].message
-        assert "normalize" in caplog.records[1].message
-
-    @pytest.mark.asyncio
-    async def test_embed_batch_async_wrong_embedding_shape(self, mock_check_valid_model):
-        texts = ["text 1", "text 2", "text 3", "text 4", "text 5"]
-
-        # embedding ndim != 2
-        with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([0.1, 0.2, 0.3])
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                concurrency_limit=1,
-            )
-
-            with pytest.raises(ValueError):
-                await embedder._embed_batch_async(texts_to_embed=texts, batch_size=2)
-
-        # embedding ndim == 2 but shape[0] != len(batch)
-        with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]])
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                concurrency_limit=1,
-            )
-
-            with pytest.raises(ValueError):
-                await embedder._embed_batch_async(texts_to_embed=texts, batch_size=2)
-
-    @pytest.mark.asyncio
-    async def test_run_async_wrong_input_format(self, mock_check_valid_model):
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"}
-        )
-
-        list_integers_input = [1, 2, 3]
-
-        with pytest.raises(TypeError):
-            await embedder.run_async(text=list_integers_input)
-
-    @pytest.mark.asyncio
-    async def test_run_async_on_empty_list(self, mock_check_valid_model):
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "BAAI/bge-small-en-v1.5"},
-            token=Secret.from_token("fake-api-token"),
-        )
-
-        empty_list_input = []
-        result = await embedder.run_async(documents=empty_list_input)
-
-        assert result["documents"] is not None
-        assert not result["documents"]  # empty list.
-
-    @pytest.mark.asyncio
-    async def test_run_async(self, mock_check_valid_model):
-        docs = [
-            Document(content="I love cheese", meta={"topic": "Cuisine"}),
-            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
-        ]
-
-        with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.side_effect = mock_embedding_generation
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                prefix="prefix ",
-                suffix=" suffix",
-                meta_fields_to_embed=["topic"],
-                embedding_separator=" | ",
-            )
-
-            result = await embedder.run_async(documents=docs)
-
-            mock_embedding_patch.assert_called_once_with(
-                text=[
-                    "prefix Cuisine | I love cheese suffix",
-                    "prefix ML | A transformer is a deep learning architecture suffix",
-                ],
-                truncate=None,
-                normalize=None,
-            )
-
-        documents_with_embeddings = result["documents"]
-
-        assert isinstance(documents_with_embeddings, list)
-        assert len(documents_with_embeddings) == len(docs)
-        for doc in documents_with_embeddings:
-            assert isinstance(doc, Document)
-            assert isinstance(doc.embedding, list)
-            assert len(doc.embedding) == 384
-            assert all(isinstance(x, float) for x in doc.embedding)
-
-    @pytest.mark.asyncio
-    async def test_run_async_custom_batch_size(self, mock_check_valid_model):
-        docs = [
-            Document(content="I love cheese", meta={"topic": "Cuisine"}),
-            Document(content="A transformer is a deep learning architecture", meta={"topic": "ML"}),
-        ]
-
-        with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.side_effect = mock_embedding_generation
-
-            embedder = HuggingFaceAPIDocumentEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                prefix="prefix ",
-                suffix=" suffix",
-                meta_fields_to_embed=["topic"],
-                embedding_separator=" | ",
-                batch_size=1,
-            )
-
-            result = await embedder.run_async(documents=docs)
-
-            assert mock_embedding_patch.call_count == 2
-
-        documents_with_embeddings = result["documents"]
-
-        assert isinstance(documents_with_embeddings, list)
-        assert len(documents_with_embeddings) == len(docs)
-        for doc in documents_with_embeddings:
-            assert isinstance(doc, Document)
-            assert isinstance(doc.embedding, list)
-            assert len(doc.embedding) == 384
-            assert all(isinstance(x, float) for x in doc.embedding)
diff --git a/test/components/embedders/test_hugging_face_api_text_embedder.py b/test/components/embedders/test_hugging_face_api_text_embedder.py
deleted file mode 100644
index 53e0d41406..0000000000
--- a/test/components/embedders/test_hugging_face_api_text_embedder.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-import random
-import sys
-from unittest.mock import MagicMock, patch
-
-import pytest
-from huggingface_hub.utils import RepositoryNotFoundError
-from numpy import array
-
-from haystack.components.embedders import HuggingFaceAPITextEmbedder
-from haystack.utils.auth import Secret
-from haystack.utils.hf import HFEmbeddingAPIType
-
-
-@pytest.fixture
-def mock_check_valid_model():
-    with patch(
-        "haystack.components.embedders.hugging_face_api_text_embedder.check_valid_model", MagicMock(return_value=None)
-    ) as mock:
-        yield mock
-
-
-class TestHuggingFaceAPITextEmbedder:
-    def test_init_invalid_api_type(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPITextEmbedder(api_type="invalid_api_type", api_params={})
-
-    def test_init_serverless(self, mock_check_valid_model):
-        model = "BAAI/bge-small-en-v1.5"
-        embedder = HuggingFaceAPITextEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model}
-        )
-
-        assert embedder.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API
-        assert embedder.api_params == {"model": model}
-        assert embedder.prefix == ""
-        assert embedder.suffix == ""
-        assert embedder.truncate
-        assert not embedder.normalize
-
-    def test_init_serverless_invalid_model(self, mock_check_valid_model):
-        mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id", response=MagicMock())
-        with pytest.raises(RepositoryNotFoundError):
-            HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "invalid_model_id"}
-            )
-
-    def test_init_serverless_no_model(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"param": "irrelevant"}
-            )
-
-    def test_init_tei(self):
-        url = "https://some_model.com"
-
-        embedder = HuggingFaceAPITextEmbedder(
-            api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": url}
-        )
-
-        assert embedder.api_type == HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE
-        assert embedder.api_params == {"url": url}
-        assert embedder.prefix == ""
-        assert embedder.suffix == ""
-        assert embedder.truncate
-        assert not embedder.normalize
-
-    def test_init_tei_invalid_url(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"url": "invalid_url"}
-            )
-
-    def test_init_tei_no_url(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.TEXT_EMBEDDINGS_INFERENCE, api_params={"param": "irrelevant"}
-            )
-
-    def test_to_dict(self, mock_check_valid_model):
-        embedder = HuggingFaceAPITextEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "BAAI/bge-small-en-v1.5"},
-            prefix="prefix",
-            suffix="suffix",
-            truncate=False,
-            normalize=True,
-        )
-
-        data = embedder.to_dict()
-
-        assert data == {
-            "type": "haystack.components.embedders.hugging_face_api_text_embedder.HuggingFaceAPITextEmbedder",
-            "init_parameters": {
-                "api_type": "serverless_inference_api",
-                "api_params": {"model": "BAAI/bge-small-en-v1.5"},
-                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
-                "prefix": "prefix",
-                "suffix": "suffix",
-                "truncate": False,
-                "normalize": True,
-            },
-        }
-
-    def test_from_dict(self, mock_check_valid_model):
-        data = {
-            "type": "haystack.components.embedders.hugging_face_api_text_embedder.HuggingFaceAPITextEmbedder",
-            "init_parameters": {
-                "api_type": HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                "api_params": {"model": "BAAI/bge-small-en-v1.5"},
-                "token": {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"},
-                "prefix": "prefix",
-                "suffix": "suffix",
-                "truncate": False,
-                "normalize": True,
-            },
-        }
-
-        embedder = HuggingFaceAPITextEmbedder.from_dict(data)
-
-        assert embedder.api_type == HFEmbeddingAPIType.SERVERLESS_INFERENCE_API
-        assert embedder.api_params == {"model": "BAAI/bge-small-en-v1.5"}
-        assert embedder.prefix == "prefix"
-        assert embedder.suffix == "suffix"
-        assert not embedder.truncate
-        assert embedder.normalize
-
-    def test_run_wrong_input_format(self, mock_check_valid_model):
-        embedder = HuggingFaceAPITextEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"}
-        )
-
-        list_integers_input = [1, 2, 3]
-
-        with pytest.raises(TypeError):
-            embedder.run(text=list_integers_input)
-
-    def test_run(self, mock_check_valid_model, caplog):
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([[random.random() for _ in range(384)]])
-
-            embedder = HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                prefix="prefix ",
-                suffix=" suffix",
-            )
-
-            result = embedder.run(text="The food was delicious")
-
-            mock_embedding_patch.assert_called_once_with(
-                text="prefix The food was delicious suffix", truncate=None, normalize=None
-            )
-
-        assert len(result["embedding"]) == 384
-        assert all(isinstance(x, float) for x in result["embedding"])
-
-        # Check that warnings about ignoring truncate and normalize are raised
-        assert len(caplog.records) == 2
-        assert "truncate" in caplog.records[0].message
-        assert "normalize" in caplog.records[1].message
-
-    @pytest.mark.asyncio
-    async def test_run_async(self, mock_check_valid_model, caplog):
-        with patch("huggingface_hub.AsyncInferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([[random.random() for _ in range(384)]])
-
-            embedder = HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "BAAI/bge-small-en-v1.5"},
-                token=Secret.from_token("fake-api-token"),
-                prefix="prefix ",
-                suffix=" suffix",
-            )
-
-            result = await embedder.run_async(text="The food was delicious")
-
-            mock_embedding_patch.assert_called_once_with(
-                text="prefix The food was delicious suffix", truncate=None, normalize=None
-            )
-
-        assert len(result["embedding"]) == 384
-        assert all(isinstance(x, float) for x in result["embedding"])
-
-        # Check that warnings about ignoring truncate and normalize are raised
-        assert len(caplog.records) == 2
-        assert "truncate" in caplog.records[0].message
-        assert "normalize" in caplog.records[1].message
-
-    def test_run_wrong_embedding_shape(self, mock_check_valid_model):
-        # embedding ndim > 2
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([[[0.1, 0.2, 0.3], [0.4, 0.5, 0.6], [0.7, 0.8, 0.9]]])
-
-            embedder = HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"}
-            )
-
-            with pytest.raises(ValueError):
-                embedder.run(text="The food was delicious")
-
-        # embedding ndim == 2 but shape[0] != 1
-        with patch("huggingface_hub.InferenceClient.feature_extraction") as mock_embedding_patch:
-            mock_embedding_patch.return_value = array([[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]])
-
-            embedder = HuggingFaceAPITextEmbedder(
-                api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "BAAI/bge-small-en-v1.5"}
-            )
-
-            with pytest.raises(ValueError):
-                embedder.run(text="The food was delicious")
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    @pytest.mark.skipif(sys.platform != "linux", reason="We only test on Linux to avoid overloading the HF server")
-    @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s")
-    def test_live_run_serverless(self):
-        embedder = HuggingFaceAPITextEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "sentence-transformers/all-MiniLM-L6-v2"},
-        )
-        embedder._client.timeout = 10  # we want to fail fast if the server is not responding
-        result = embedder.run(text="The food was delicious")
-
-        assert len(result["embedding"]) == 384
-        assert all(isinstance(x, float) for x in result["embedding"])
-
-    @pytest.mark.integration
-    @pytest.mark.asyncio
-    @pytest.mark.slow
-    @pytest.mark.flaky(reruns=3, reruns_delay=10)
-    @pytest.mark.skipif(os.environ.get("HF_API_TOKEN", "") == "", reason="HF_API_TOKEN is not set")
-    @pytest.mark.skipif(sys.platform != "linux", reason="We only test on Linux to avoid overloading the HF server")
-    @pytest.mark.xfail(reason="hf-inference is temporarily returning 500s")
-    async def test_live_run_async_serverless(self):
-        model_name = "sentence-transformers/all-MiniLM-L6-v2"
-
-        embedder = HuggingFaceAPITextEmbedder(
-            api_type=HFEmbeddingAPIType.SERVERLESS_INFERENCE_API, api_params={"model": model_name}
-        )
-        embedder._client.timeout = 10  # we want to fail fast if the server is not responding
-
-        text = "This is a test sentence for embedding."
-        result = await embedder.run_async(text=text)
-
-        assert "embedding" in result
-        assert isinstance(result["embedding"], list)
-        assert all(isinstance(x, float) for x in result["embedding"])
-        assert len(result["embedding"]) == 384  # MiniLM-L6-v2 has 384 dimensions
diff --git a/test/components/generators/chat/test_hugging_face_api.py b/test/components/generators/chat/test_hugging_face_api.py
deleted file mode 100644
index 50a0fcb368..0000000000
--- a/test/components/generators/chat/test_hugging_face_api.py
+++ /dev/null
@@ -1,1756 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-import os
-from datetime import datetime
-from typing import Any
-from unittest.mock import AsyncMock, MagicMock, Mock, patch
-
-import pytest
-from huggingface_hub import (
-    ChatCompletionInputStreamOptions,
-    ChatCompletionOutput,
-    ChatCompletionOutputComplete,
-    ChatCompletionOutputFunctionDefinition,
-    ChatCompletionOutputMessage,
-    ChatCompletionOutputToolCall,
-    ChatCompletionOutputUsage,
-    ChatCompletionStreamOutput,
-    ChatCompletionStreamOutputChoice,
-    ChatCompletionStreamOutputDelta,
-    ChatCompletionStreamOutputUsage,
-)
-from huggingface_hub.errors import RepositoryNotFoundError
-
-from haystack import Pipeline
-from haystack.components.generators.chat.hugging_face_api import (
-    HuggingFaceAPIChatGenerator,
-    _convert_chat_completion_stream_output_to_streaming_chunk,
-    _convert_hfapi_tool_calls,
-    _convert_tools_to_hfapi_tools,
-    _resolve_schema_refs,
-)
-from haystack.dataclasses import ChatMessage, ImageContent, ReasoningContent, StreamingChunk, ToolCall
-from haystack.tools import Tool
-from haystack.tools.toolset import Toolset
-from haystack.utils.auth import Secret
-from haystack.utils.hf import HFGenerationAPIType
-
-
-@pytest.fixture
-def chat_messages():
-    return [
-        ChatMessage.from_system("You are a helpful assistant speaking A2 level of English"),
-        ChatMessage.from_user("Tell me about Berlin"),
-    ]
-
-
-def get_weather(city: str) -> dict[str, Any]:
-    weather_info = {
-        "Berlin": {"weather": "mostly sunny", "temperature": 7, "unit": "celsius"},
-        "Paris": {"weather": "mostly cloudy", "temperature": 8, "unit": "celsius"},
-        "Rome": {"weather": "sunny", "temperature": 14, "unit": "celsius"},
-    }
-    return weather_info.get(city, {"weather": "unknown", "temperature": 0, "unit": "celsius"})
-
-
-@pytest.fixture
-def tools():
-    weather_tool = Tool(
-        name="weather",
-        description="useful to determine the weather in a given location",
-        parameters={"type": "object", "properties": {"city": {"type": "string"}}, "required": ["city"]},
-        function=get_weather,
-    )
-    return [weather_tool]
-
-
-@pytest.fixture
-def mock_check_valid_model():
-    with patch(
-        "haystack.components.generators.chat.hugging_face_api.check_valid_model", MagicMock(return_value=None)
-    ) as mock:
-        yield mock
-
-
-@pytest.fixture
-def mock_chat_completion():
-    # https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.chat_completion.example
-
-    with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion:
-        completion = ChatCompletionOutput(
-            choices=[
-                ChatCompletionOutputComplete(
-                    finish_reason="eos_token",
-                    index=0,
-                    message=ChatCompletionOutputMessage(content="The capital of France is Paris.", role="assistant"),
-                )
-            ],
-            id="some_id",
-            model="some_model",
-            system_fingerprint="some_fingerprint",
-            usage=ChatCompletionOutputUsage(completion_tokens=8, prompt_tokens=17, total_tokens=25),
-            created=1710498360,
-        )
-
-        mock_chat_completion.return_value = completion
-        yield mock_chat_completion
-
-
-@pytest.fixture
-def mock_chat_completion_async():
-    with patch("huggingface_hub.AsyncInferenceClient.chat_completion", autospec=True) as mock_chat_completion:
-        completion = ChatCompletionOutput(
-            choices=[
-                ChatCompletionOutputComplete(
-                    finish_reason="eos_token",
-                    index=0,
-                    message=ChatCompletionOutputMessage(content="The capital of France is Paris.", role="assistant"),
-                )
-            ],
-            id="some_id",
-            model="some_model",
-            system_fingerprint="some_fingerprint",
-            usage=ChatCompletionOutputUsage(completion_tokens=8, prompt_tokens=17, total_tokens=25),
-            created=1710498360,
-        )
-
-        # Use AsyncMock to properly mock the async method
-        mock_chat_completion.return_value = completion
-        mock_chat_completion.__call__ = AsyncMock(return_value=completion)
-
-        yield mock_chat_completion
-
-
-# used to test serialization of streaming_callback
-def streaming_callback_handler(x):
-    return x
-
-
-class TestHuggingFaceAPIChatGenerator:
-    def test_init_invalid_api_type(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIChatGenerator(api_type="invalid_api_type", api_params={})
-
-    def test_init_serverless(self, mock_check_valid_model):
-        model = "HuggingFaceH4/zephyr-7b-alpha"
-        generation_kwargs = {"temperature": 0.6}
-        stop_words = ["stop"]
-        streaming_callback = None
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": model},
-            token=None,
-            generation_kwargs=generation_kwargs,
-            stop_words=stop_words,
-            streaming_callback=streaming_callback,
-        )
-
-        assert generator.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API
-        assert generator.api_params == {"model": model}
-        assert generator.generation_kwargs == {**generation_kwargs, "stop": ["stop"], "max_tokens": 512}
-        assert generator.streaming_callback == streaming_callback
-        assert generator.tools is None
-
-        # check that client and async_client are initialized
-        assert generator._client.model == model
-        assert generator._async_client.model == model
-
-    def test_init_serverless_with_tools(self, mock_check_valid_model, tools):
-        model = "HuggingFaceH4/zephyr-7b-alpha"
-        generation_kwargs = {"temperature": 0.6}
-        stop_words = ["stop"]
-        streaming_callback = None
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": model},
-            token=None,
-            generation_kwargs=generation_kwargs,
-            stop_words=stop_words,
-            streaming_callback=streaming_callback,
-            tools=tools,
-        )
-
-        assert generator.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API
-        assert generator.api_params == {"model": model}
-        assert generator.generation_kwargs == {**generation_kwargs, "stop": ["stop"], "max_tokens": 512}
-        assert generator.streaming_callback == streaming_callback
-        assert generator.tools == tools
-
-        assert generator._client.model == model
-        assert generator._async_client.model == model
-
-    def test_init_serverless_invalid_model(self, mock_check_valid_model):
-        mock_check_valid_model.side_effect = RepositoryNotFoundError("Invalid model id", response=MagicMock())
-        with pytest.raises(RepositoryNotFoundError):
-            HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "invalid_model_id"}
-            )
-
-    def test_init_serverless_no_model(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"param": "irrelevant"}
-            )
-
-    def test_init_tgi(self):
-        url = "https://some_model.com"
-        generation_kwargs = {"temperature": 0.6}
-        stop_words = ["stop"]
-        streaming_callback = None
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE,
-            api_params={"url": url},
-            token=None,
-            generation_kwargs=generation_kwargs,
-            stop_words=stop_words,
-            streaming_callback=streaming_callback,
-        )
-
-        assert generator.api_type == HFGenerationAPIType.TEXT_GENERATION_INFERENCE
-        assert generator.api_params == {"url": url}
-        assert generator.generation_kwargs == {**generation_kwargs, "stop": ["stop"], "max_tokens": 512}
-        assert generator.streaming_callback == streaming_callback
-        assert generator.tools is None
-
-        assert generator._client.model == url
-        assert generator._async_client.model == url
-
-    def test_init_tgi_invalid_url(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, api_params={"url": "invalid_url"}
-            )
-
-    def test_init_tgi_no_url(self):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.TEXT_GENERATION_INFERENCE, api_params={"param": "irrelevant"}
-            )
-
-    def test_init_fail_with_duplicate_tool_names(self, mock_check_valid_model, tools):
-        duplicate_tools = [tools[0], tools[0]]
-        with pytest.raises(ValueError):
-            HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "irrelevant"},
-                tools=duplicate_tools,
-            )
-
-    def test_init_fail_with_tools_and_streaming(self, mock_check_valid_model, tools):
-        with pytest.raises(ValueError):
-            HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "irrelevant"},
-                tools=tools,
-                streaming_callback=streaming_callback_handler,
-            )
-
-    def test_to_dict(self, mock_check_valid_model):
-        tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print)
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
-            generation_kwargs={"temperature": 0.6},
-            stop_words=["stop", "words"],
-            tools=[tool],
-        )
-
-        result = generator.to_dict()
-        init_params = result["init_parameters"]
-
-        assert init_params["api_type"] == "serverless_inference_api"
-        assert init_params["api_params"] == {"model": "HuggingFaceH4/zephyr-7b-beta"}
-        assert init_params["token"] == {"env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False, "type": "env_var"}
-        assert init_params["generation_kwargs"] == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512}
-        assert init_params["streaming_callback"] is None
-        assert init_params["tools"] == [
-            {
-                "type": "haystack.tools.tool.Tool",
-                "data": {
-                    "async_function": None,
-                    "description": "description",
-                    "function": "builtins.print",
-                    "inputs_from_state": None,
-                    "name": "name",
-                    "outputs_to_state": None,
-                    "outputs_to_string": None,
-                    "parameters": {"x": {"type": "string"}},
-                },
-            }
-        ]
-
-    def test_from_dict(self, mock_check_valid_model):
-        tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print)
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
-            token=Secret.from_env_var("ENV_VAR", strict=False),
-            generation_kwargs={"temperature": 0.6},
-            stop_words=["stop", "words"],
-            tools=[tool],
-        )
-        result = generator.to_dict()
-
-        # now deserialize, call from_dict
-        generator_2 = HuggingFaceAPIChatGenerator.from_dict(result)
-        assert generator_2.api_type == HFGenerationAPIType.SERVERLESS_INFERENCE_API
-        assert generator_2.api_params == {"model": "HuggingFaceH4/zephyr-7b-beta"}
-        assert generator_2.token == Secret.from_env_var("ENV_VAR", strict=False)
-        assert generator_2.generation_kwargs == {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512}
-        assert generator_2.streaming_callback is None
-        assert generator_2.tools == [tool]
-
-    def test_serde_in_pipeline(self, mock_check_valid_model):
-        tool = Tool(name="name", description="description", parameters={"x": {"type": "string"}}, function=print)
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "HuggingFaceH4/zephyr-7b-beta"},
-            token=Secret.from_env_var("ENV_VAR", strict=False),
-            generation_kwargs={"temperature": 0.6},
-            stop_words=["stop", "words"],
-            tools=[tool],
-        )
-
-        pipeline = Pipeline()
-        pipeline.add_component("generator", generator)
-
-        pipeline_dict = pipeline.to_dict()
-        assert pipeline_dict == {
-            "metadata": {},
-            "max_runs_per_component": 100,
-            "connection_type_validation": True,
-            "components": {
-                "generator": {
-                    "type": "haystack.components.generators.chat.hugging_face_api.HuggingFaceAPIChatGenerator",
-                    "init_parameters": {
-                        "api_type": "serverless_inference_api",
-                        "api_params": {"model": "HuggingFaceH4/zephyr-7b-beta"},
-                        "token": {"type": "env_var", "env_vars": ["ENV_VAR"], "strict": False},
-                        "generation_kwargs": {"temperature": 0.6, "stop": ["stop", "words"], "max_tokens": 512},
-                        "streaming_callback": None,
-                        "tools": [
-                            {
-                                "type": "haystack.tools.tool.Tool",
-                                "data": {
-                                    "inputs_from_state": None,
-                                    "name": "name",
-                                    "outputs_to_state": None,
-                                    "outputs_to_string": None,
-                                    "description": "description",
-                                    "parameters": {"x": {"type": "string"}},
-                                    "function": "builtins.print",
-                                    "async_function": None,
-                                },
-                            }
-                        ],
-                    },
-                }
-            },
-            "connections": [],
-        }
-
-        pipeline_yaml = pipeline.dumps()
-
-        new_pipeline = Pipeline.loads(pipeline_yaml)
-        assert new_pipeline == pipeline
-
-    def test_run(self, mock_check_valid_model, mock_chat_completion, chat_messages):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-            generation_kwargs={"temperature": 0.6},
-            stop_words=["stop", "words"],
-            streaming_callback=None,
-        )
-
-        response = generator.run(messages=chat_messages)
-
-        # check kwargs passed to chat_completion
-        _, kwargs = mock_chat_completion.call_args
-        hf_messages = [
-            {"role": "system", "content": "You are a helpful assistant speaking A2 level of English"},
-            {"role": "user", "content": "Tell me about Berlin"},
-        ]
-        assert kwargs == {
-            "temperature": 0.6,
-            "stop": ["stop", "words"],
-            "max_tokens": 512,
-            "tools": None,
-            "messages": hf_messages,
-        }
-
-        assert isinstance(response, dict)
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) == 1
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-
-    def test_run_with_string_input(self, mock_check_valid_model, mock_chat_completion):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-        )
-        response = generator.run("What's the capital of France?")
-
-        _, kwargs = mock_chat_completion.call_args
-        assert kwargs["messages"] == [{"role": "user", "content": "What's the capital of France?"}]
-
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) == 1
-        assert isinstance(response["replies"][0], ChatMessage)
-
-    def test_run_with_streaming_callback(self, mock_check_valid_model, mock_chat_completion, chat_messages):
-        streaming_call_count = 0
-
-        # Define the streaming callback function
-        def streaming_callback_fn(chunk: StreamingChunk):
-            nonlocal streaming_call_count
-            streaming_call_count += 1
-            assert isinstance(chunk, StreamingChunk)
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-            streaming_callback=streaming_callback_fn,
-        )
-
-        # Create a fake streamed response
-        # self needed here, don't remove
-        def mock_iter(self):
-            yield ChatCompletionStreamOutput(
-                choices=[
-                    ChatCompletionStreamOutputChoice(
-                        delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"),
-                        index=0,
-                        finish_reason=None,
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                created=1710498504,
-            )
-
-            yield ChatCompletionStreamOutput(
-                choices=[
-                    ChatCompletionStreamOutputChoice(
-                        delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length"
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                created=1710498504,
-            )
-
-        mock_response = Mock(__iter__=mock_iter)
-        mock_chat_completion.return_value = mock_response
-
-        # Generate text response with streaming callback
-        response = generator.run(chat_messages)
-
-        # check kwargs passed to text_generation
-        _, kwargs = mock_chat_completion.call_args
-        assert kwargs == {
-            "stop": [],
-            "stream": True,
-            "max_tokens": 512,
-            "stream_options": ChatCompletionInputStreamOptions(include_usage=True),
-        }
-
-        # Assert that the streaming callback was called twice
-        assert streaming_call_count == 2
-
-        # Assert that the response contains the generated replies
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) > 0
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-
-    def test_run_with_streaming_callback_in_run_method(
-        self, mock_check_valid_model, mock_chat_completion, chat_messages
-    ):
-        streaming_call_count = 0
-
-        # Define the streaming callback function
-        def streaming_callback_fn(chunk: StreamingChunk):
-            nonlocal streaming_call_count
-            streaming_call_count += 1
-            assert isinstance(chunk, StreamingChunk)
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-        )
-
-        # Create a fake streamed response
-        # self needed here, don't remove
-        def mock_iter(self):
-            yield ChatCompletionStreamOutput(
-                choices=[
-                    ChatCompletionStreamOutputChoice(
-                        delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"),
-                        index=0,
-                        finish_reason=None,
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                created=1710498504,
-            )
-
-            yield ChatCompletionStreamOutput(
-                choices=[
-                    ChatCompletionStreamOutputChoice(
-                        delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length"
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                created=1710498504,
-            )
-
-        mock_response = Mock(__iter__=mock_iter)
-        mock_chat_completion.return_value = mock_response
-
-        # Generate text response with streaming callback
-        response = generator.run(chat_messages, streaming_callback=streaming_callback_fn)
-
-        # check kwargs passed to text_generation
-        _, kwargs = mock_chat_completion.call_args
-        assert kwargs == {
-            "stop": [],
-            "stream": True,
-            "max_tokens": 512,
-            "stream_options": ChatCompletionInputStreamOptions(include_usage=True),
-        }
-
-        # Assert that the streaming callback was called twice
-        assert streaming_call_count == 2
-
-        # Assert that the response contains the generated replies
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) > 0
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-
-    def test_run_fail_with_tools_and_streaming(self, tools, mock_check_valid_model):
-        component = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-            streaming_callback=streaming_callback_handler,
-        )
-
-        with pytest.raises(ValueError):
-            message = ChatMessage.from_user("irrelevant")
-            component.run([message], tools=tools)
-
-    def test_run_with_tools(self, mock_check_valid_model, tools):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-3.1-70B-Instruct"},
-            tools=tools,
-        )
-
-        with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion:
-            completion = ChatCompletionOutput(
-                choices=[
-                    ChatCompletionOutputComplete(
-                        finish_reason="stop",
-                        index=0,
-                        message=ChatCompletionOutputMessage(
-                            role="assistant",
-                            content=None,
-                            tool_calls=[
-                                ChatCompletionOutputToolCall(
-                                    function=ChatCompletionOutputFunctionDefinition(
-                                        arguments={"city": "Paris"}, name="weather", description=None
-                                    ),
-                                    id="0",
-                                    type="function",
-                                )
-                            ],
-                        ),
-                        logprobs=None,
-                    )
-                ],
-                created=1729074760,
-                id="",
-                model="meta-llama/Llama-3.1-70B-Instruct",
-                system_fingerprint="2.3.2-dev0-sha-28bb7ae",
-                usage=ChatCompletionOutputUsage(completion_tokens=30, prompt_tokens=426, total_tokens=456),
-            )
-            mock_chat_completion.return_value = completion
-
-            messages = [ChatMessage.from_user("What is the weather in Paris?")]
-            response = generator.run(messages=messages)
-
-        assert isinstance(response, dict)
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) == 1
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-        assert response["replies"][0].tool_calls[0].tool_name == "weather"
-        assert response["replies"][0].tool_calls[0].arguments == {"city": "Paris"}
-        assert response["replies"][0].tool_calls[0].id == "0"
-        assert response["replies"][0].meta == {
-            "finish_reason": "tool_calls",
-            "index": 0,
-            "model": "meta-llama/Llama-3.1-70B-Instruct",
-            "usage": {"completion_tokens": 30, "prompt_tokens": 426},
-        }
-
-    def test_convert_hfapi_tool_calls_empty(self):
-        hfapi_tool_calls = None
-        tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls)
-        assert len(tool_calls) == 0
-
-        hfapi_tool_calls = []
-        tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls)
-        assert len(tool_calls) == 0
-
-    def test_convert_hfapi_tool_calls_dict_arguments(self):
-        hfapi_tool_calls = [
-            ChatCompletionOutputToolCall(
-                function=ChatCompletionOutputFunctionDefinition(
-                    arguments={"city": "Paris"}, name="weather", description=None
-                ),
-                id="0",
-                type="function",
-            )
-        ]
-        tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls)
-        assert len(tool_calls) == 1
-        assert tool_calls[0].tool_name == "weather"
-        assert tool_calls[0].arguments == {"city": "Paris"}
-        assert tool_calls[0].id == "0"
-
-    def test_convert_hfapi_tool_calls_str_arguments(self):
-        hfapi_tool_calls = [
-            ChatCompletionOutputToolCall(
-                function=ChatCompletionOutputFunctionDefinition(
-                    arguments='{"city": "Paris"}', name="weather", description=None
-                ),
-                id="0",
-                type="function",
-            )
-        ]
-        tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls)
-        assert len(tool_calls) == 1
-        assert tool_calls[0].tool_name == "weather"
-        assert tool_calls[0].arguments == {"city": "Paris"}
-        assert tool_calls[0].id == "0"
-
-    def test_convert_hfapi_tool_calls_invalid_str_arguments(self):
-        hfapi_tool_calls = [
-            ChatCompletionOutputToolCall(
-                function=ChatCompletionOutputFunctionDefinition(
-                    arguments="not a valid JSON string", name="weather", description=None
-                ),
-                id="0",
-                type="function",
-            )
-        ]
-        tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls)
-        assert len(tool_calls) == 0
-
-    def test_convert_hfapi_tool_calls_invalid_type_arguments(self):
-        hfapi_tool_calls = [
-            ChatCompletionOutputToolCall(
-                function=ChatCompletionOutputFunctionDefinition(
-                    arguments=["this", "is", "a", "list"], name="weather", description=None
-                ),
-                id="0",
-                type="function",
-            )
-        ]
-        tool_calls = _convert_hfapi_tool_calls(hfapi_tool_calls)
-        assert len(tool_calls) == 0
-
-    @pytest.mark.parametrize(
-        "hf_stream_output, expected_stream_chunk, dummy_previous_chunks",
-        [
-            (
-                ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(role="assistant", content=" Paris"), index=0
-                        )
-                    ],
-                    created=1748339326,
-                    id="",
-                    model="microsoft/Phi-3.5-mini-instruct",
-                    system_fingerprint="3.2.1-sha-4d28897",
-                ),
-                StreamingChunk(
-                    content=" Paris",
-                    meta={
-                        "received_at": "2025-05-27T12:14:28.228852",
-                        "model": "microsoft/Phi-3.5-mini-instruct",
-                        "finish_reason": None,
-                    },
-                    index=0,
-                    start=True,
-                ),
-                [],
-            ),
-            (
-                ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(role="assistant", content=""),
-                            index=0,
-                            finish_reason="stop",
-                        )
-                    ],
-                    created=1748339326,
-                    id="",
-                    model="microsoft/Phi-3.5-mini-instruct",
-                    system_fingerprint="3.2.1-sha-4d28897",
-                ),
-                StreamingChunk(
-                    content="",
-                    meta={
-                        "received_at": "2025-05-27T12:14:28.228852",
-                        "model": "microsoft/Phi-3.5-mini-instruct",
-                        "finish_reason": "stop",
-                    },
-                    finish_reason="stop",
-                ),
-                [0],
-            ),
-            (
-                ChatCompletionStreamOutput(
-                    choices=[],
-                    created=1748339326,
-                    id="",
-                    model="microsoft/Phi-3.5-mini-instruct",
-                    system_fingerprint="3.2.1-sha-4d28897",
-                    usage=ChatCompletionStreamOutputUsage(completion_tokens=2, prompt_tokens=21, total_tokens=23),
-                ),
-                StreamingChunk(
-                    content="",
-                    meta={
-                        "received_at": "2025-05-27T12:14:28.228852",
-                        "model": "microsoft/Phi-3.5-mini-instruct",
-                        "usage": {"completion_tokens": 2, "prompt_tokens": 21},
-                    },
-                ),
-                [0, 1],
-            ),
-        ],
-    )
-    def test_convert_chat_completion_stream_output_to_streaming_chunk(
-        self, hf_stream_output, expected_stream_chunk, dummy_previous_chunks
-    ):
-        converted_stream_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(
-            chunk=hf_stream_output, previous_chunks=dummy_previous_chunks
-        )
-        # Remove timestamp from comparison since it's always the current time
-        converted_stream_chunk.meta.pop("received_at", None)
-        expected_stream_chunk.meta.pop("received_at", None)
-        assert converted_stream_chunk == expected_stream_chunk
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    @pytest.mark.flaky(reruns=2, reruns_delay=10)
-    def test_live_run_serverless(self):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "Qwen/Qwen2.5-7B-Instruct", "provider": "together"},
-            generation_kwargs={"max_tokens": 20},
-        )
-
-        # No need for instruction tokens here since we use the chat_completion endpoint which handles the chat
-        # templating for us.
-        messages = [
-            ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.")
-        ]
-        response = generator.run(messages=messages)
-
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) > 0
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-        assert response["replies"][0].text is not None
-        meta = response["replies"][0].meta
-        assert "usage" in meta
-        assert "prompt_tokens" in meta["usage"]
-        assert meta["usage"]["prompt_tokens"] > 0
-        assert "completion_tokens" in meta["usage"]
-        assert meta["usage"]["completion_tokens"] > 0
-        assert meta["model"] == "Qwen/Qwen2.5-7B-Instruct"
-        assert meta["finish_reason"] is not None
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    @pytest.mark.flaky(reruns=2, reruns_delay=10)
-    def test_live_run_serverless_streaming(self):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "Qwen/Qwen2.5-7B-Instruct", "provider": "together"},
-            generation_kwargs={"max_tokens": 20},
-            streaming_callback=streaming_callback_handler,
-        )
-
-        # No need for instruction tokens here since we use the chat_completion endpoint which handles the chat
-        # templating for us.
-        messages = [
-            ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.")
-        ]
-        response = generator.run(messages=messages)
-
-        print(response)
-
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) > 0
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-        assert response["replies"][0].text is not None
-
-        response_meta = response["replies"][0].meta
-        assert "completion_start_time" in response_meta
-        assert datetime.fromisoformat(response_meta["completion_start_time"]) <= datetime.now()
-        assert "usage" in response_meta
-        assert "prompt_tokens" in response_meta["usage"]
-        assert response_meta["usage"]["prompt_tokens"] >= 0
-        assert "completion_tokens" in response_meta["usage"]
-        assert response_meta["usage"]["completion_tokens"] >= 0
-        # internally, Together calls this "Qwen/Qwen2.5-7B-Instruct-Turbo"
-        assert "Qwen/Qwen2.5-7B-Instruct" in response_meta["model"]
-        assert response_meta["finish_reason"] is not None
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    def test_live_run_with_tools(self, tools):
-        """
-        We test the round trip: generate tool call, pass tool message, generate response.
-
-        The model used here is not gated and kept in a warm state.
-        """
-
-        chat_messages = [ChatMessage.from_user("What's the weather like in Paris?")]
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "Qwen/Qwen3.5-9B", "provider": "together"},
-            generation_kwargs={"temperature": 0.5, "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}},
-        )
-
-        results = generator.run(chat_messages, tools=tools)
-        assert len(results["replies"]) == 1
-        message = results["replies"][0]
-
-        assert message.tool_calls
-        tool_call = message.tool_call
-        assert isinstance(tool_call, ToolCall)
-        assert tool_call.tool_name == "weather"
-        assert "city" in tool_call.arguments
-        assert "Paris" in tool_call.arguments["city"]
-        assert message.meta["finish_reason"] == "tool_calls"
-
-        new_messages = chat_messages + [message, ChatMessage.from_tool(tool_result="22° C", origin=tool_call)]
-
-        # the model tends to make tool calls if provided with tools, so we don't pass them here
-        results = generator.run(new_messages, generation_kwargs={"max_tokens": 50})
-
-        assert len(results["replies"]) == 1
-        final_message = results["replies"][0]
-        assert not final_message.tool_calls
-        assert len(final_message.text) > 0
-        assert "paris" in final_message.text.lower() and "22" in final_message.text
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    def test_live_run_multimodal(self, test_files_path):
-        image_path = test_files_path / "images" / "apple.jpg"
-        # Resize the image to keep this test fast
-        image_content = ImageContent.from_file_path(file_path=image_path, size=(100, 100))
-        messages = [ChatMessage.from_user(content_parts=["What does this image show? Max 5 words", image_content])]
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "Qwen/Qwen3.5-9B", "provider": "together"},
-            generation_kwargs={"max_tokens": 20, "extra_body": {"chat_template_kwargs": {"enable_thinking": False}}},
-        )
-
-        response = generator.run(messages=messages)
-
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) > 0
-        message = response["replies"][0]
-        assert message.text
-        assert len(message.text) > 0
-        assert any(word in message.text.lower() for word in ["apple", "fruit", "red"])
-
-    @pytest.mark.asyncio
-    async def test_run_async(self, mock_check_valid_model, mock_chat_completion_async, chat_messages):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-            generation_kwargs={"temperature": 0.6},
-            stop_words=["stop", "words"],
-            streaming_callback=None,
-        )
-
-        response = await generator.run_async(messages=chat_messages)
-
-        # check kwargs passed to chat_completion
-        _, kwargs = mock_chat_completion_async.call_args
-        hf_messages = [
-            {"role": "system", "content": "You are a helpful assistant speaking A2 level of English"},
-            {"role": "user", "content": "Tell me about Berlin"},
-        ]
-        assert kwargs == {
-            "temperature": 0.6,
-            "stop": ["stop", "words"],
-            "max_tokens": 512,
-            "tools": None,
-            "messages": hf_messages,
-        }
-
-        assert isinstance(response, dict)
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) == 1
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-
-    async def test_run_async_with_string_input(self, mock_check_valid_model, mock_chat_completion_async):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-        )
-        response = await generator.run_async("What's the capital of France?")
-
-        _, kwargs = mock_chat_completion_async.call_args
-        assert kwargs["messages"] == [{"role": "user", "content": "What's the capital of France?"}]
-
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) == 1
-        assert isinstance(response["replies"][0], ChatMessage)
-
-    @pytest.mark.asyncio
-    async def test_run_async_with_streaming(self, mock_check_valid_model, mock_chat_completion_async, chat_messages):
-        streaming_call_count = 0
-
-        async def streaming_callback_fn(chunk: StreamingChunk):
-            nonlocal streaming_call_count
-            streaming_call_count += 1
-            assert isinstance(chunk, StreamingChunk)
-
-        # Create a fake streamed response
-        async def mock_aiter(self):
-            yield ChatCompletionStreamOutput(
-                choices=[
-                    ChatCompletionStreamOutputChoice(
-                        delta=ChatCompletionStreamOutputDelta(content="The", role="assistant"),
-                        index=0,
-                        finish_reason=None,
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                created=1710498504,
-            )
-
-            yield ChatCompletionStreamOutput(
-                choices=[
-                    ChatCompletionStreamOutputChoice(
-                        delta=ChatCompletionStreamOutputDelta(content=None, role=None), index=0, finish_reason="length"
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                created=1710498504,
-            )
-
-        mock_response = Mock(__aiter__=mock_aiter)
-        mock_chat_completion_async.return_value = mock_response
-
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-            streaming_callback=streaming_callback_fn,
-        )
-
-        response = await generator.run_async(messages=chat_messages)
-
-        # check kwargs passed to chat_completion
-        _, kwargs = mock_chat_completion_async.call_args
-        assert kwargs == {
-            "stop": [],
-            "stream": True,
-            "max_tokens": 512,
-            "stream_options": ChatCompletionInputStreamOptions(include_usage=True),
-        }
-
-        # Assert that the streaming callback was called twice
-        assert streaming_call_count == 2
-
-        # Assert that the response contains the generated replies
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) > 0
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-
-    @pytest.mark.asyncio
-    async def test_run_async_with_tools(self, tools, mock_check_valid_model):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-3.1-70B-Instruct"},
-            tools=tools,
-        )
-
-        with patch("huggingface_hub.AsyncInferenceClient.chat_completion", autospec=True) as mock_chat_completion_async:
-            completion = ChatCompletionOutput(
-                choices=[
-                    ChatCompletionOutputComplete(
-                        finish_reason="stop",
-                        index=0,
-                        message=ChatCompletionOutputMessage(
-                            role="assistant",
-                            content=None,
-                            tool_calls=[
-                                ChatCompletionOutputToolCall(
-                                    function=ChatCompletionOutputFunctionDefinition(
-                                        arguments={"city": "Paris"}, name="weather", description=None
-                                    ),
-                                    id="0",
-                                    type="function",
-                                )
-                            ],
-                        ),
-                        logprobs=None,
-                    )
-                ],
-                created=1729074760,
-                id="",
-                model="meta-llama/Llama-3.1-70B-Instruct",
-                system_fingerprint="2.3.2-dev0-sha-28bb7ae",
-                usage=ChatCompletionOutputUsage(completion_tokens=30, prompt_tokens=426, total_tokens=456),
-            )
-            mock_chat_completion_async.return_value = completion
-
-            messages = [ChatMessage.from_user("What is the weather in Paris?")]
-            response = await generator.run_async(messages=messages)
-
-        assert isinstance(response, dict)
-        assert "replies" in response
-        assert isinstance(response["replies"], list)
-        assert len(response["replies"]) == 1
-        assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-        assert response["replies"][0].tool_calls[0].tool_name == "weather"
-        assert response["replies"][0].tool_calls[0].arguments == {"city": "Paris"}
-        assert response["replies"][0].tool_calls[0].id == "0"
-        assert response["replies"][0].meta == {
-            "finish_reason": "tool_calls",
-            "index": 0,
-            "model": "meta-llama/Llama-3.1-70B-Instruct",
-            "usage": {"completion_tokens": 30, "prompt_tokens": 426},
-        }
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    @pytest.mark.flaky(reruns=2, reruns_delay=10)
-    @pytest.mark.asyncio
-    async def test_live_run_async_serverless(self):
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "Qwen/Qwen2.5-7B-Instruct", "provider": "together"},
-            generation_kwargs={"max_tokens": 20},
-        )
-
-        messages = [
-            ChatMessage.from_user("What is the capital of France? Be concise only provide the capital, nothing else.")
-        ]
-        try:
-            response = await generator.run_async(messages=messages)
-
-            assert "replies" in response
-            assert isinstance(response["replies"], list)
-            assert len(response["replies"]) > 0
-            assert [isinstance(reply, ChatMessage) for reply in response["replies"]]
-            assert response["replies"][0].text is not None
-
-            meta = response["replies"][0].meta
-            assert "usage" in meta
-            assert "prompt_tokens" in meta["usage"]
-            assert meta["usage"]["prompt_tokens"] > 0
-            assert "completion_tokens" in meta["usage"]
-            assert meta["usage"]["completion_tokens"] > 0
-            assert meta["model"] == "Qwen/Qwen2.5-7B-Instruct"
-            assert meta["finish_reason"] is not None
-        finally:
-            await generator._async_client.close()
-
-    @pytest.mark.integration
-    @pytest.mark.slow
-    @pytest.mark.skipif(
-        not os.environ.get("HF_API_TOKEN", None),
-        reason="Export an env var called HF_API_TOKEN containing the Hugging Face token to run this test.",
-    )
-    @pytest.mark.flaky(reruns=2, reruns_delay=10)
-    def test_live_run_multi_turn_with_reasoning_model(self):
-        """
-        Test multi-turn conversation with a reasoning model.
-
-        This test verifies that:
-        1. Reasoning content is captured from the model's response
-        2. When the assistant message (with reasoning) is sent back in a multi-turn conversation,
-           the API call succeeds (reasoning is dropped during conversion since HF API doesn't support it)
-        """
-        # Note: Using a model that supports reasoning AND a provider that actually follows the spec defined in
-        # huggingface-hub. Reasoning content especially seems to be non-standard across providers and is either left
-        # in the main response or put in a new field that is not part of the official API.
-        # One combo that does respect the spec is together + openai/gpt-oss-20b.
-        # together + openai/gpt-oss-20b actually uses the expected reasoning field in the response
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            # We use together + openai/gpt-oss-20b since it actually returns reasoning content in the expected field
-            api_params={"model": "openai/gpt-oss-20b", "provider": "together"},
-            generation_kwargs={"max_tokens": 300},
-        )
-
-        # First turn: ask a question
-        messages = [ChatMessage.from_user("What is 2 + 2? Answer briefly.")]
-        response = generator.run(messages=messages)
-
-        assert "replies" in response
-        assert len(response["replies"]) > 0
-        first_reply = response["replies"][0]
-        assert first_reply.text is not None
-        assert first_reply.reasoning is not None
-
-        # Second turn: send a follow-up including the assistant's previous response
-        # This tests that convert_message_to_hf_format properly handles messages
-        # that may contain ReasoningContent (it should skip it)
-        follow_up_messages = [
-            ChatMessage.from_user("What is 2 + 2? Answer briefly."),
-            first_reply,  # Include the assistant's response with reasoning
-            ChatMessage.from_user("Now what is 3 + 3? Answer briefly."),
-        ]
-        follow_up_response = generator.run(messages=follow_up_messages)
-
-        # Verify the second turn succeeds
-        assert "replies" in follow_up_response
-        assert len(follow_up_response["replies"]) > 0
-        assert follow_up_response["replies"][0].text is not None
-        assert follow_up_response["replies"][0].reasoning is not None
-
-    def test_hugging_face_api_generator_with_toolset_initialization(self, mock_check_valid_model, tools):
-        """Test that the HuggingFaceAPIChatGenerator can be initialized with a Toolset."""
-        toolset = Toolset(tools)
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset
-        )
-        assert generator.tools == toolset
-
-    def test_from_dict_with_toolset(self, mock_check_valid_model, tools):
-        """Test that the HuggingFaceAPIChatGenerator can be deserialized from a dictionary with a Toolset."""
-        toolset = Toolset(tools)
-        component = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset
-        )
-        data = component.to_dict()
-
-        deserialized_component = HuggingFaceAPIChatGenerator.from_dict(data)
-
-        assert isinstance(deserialized_component.tools, Toolset)
-        assert len(deserialized_component.tools) == len(tools)
-        assert all(isinstance(tool, Tool) for tool in deserialized_component.tools)
-
-    def test_to_dict_with_toolset(self, mock_check_valid_model, tools):
-        """Test that the HuggingFaceAPIChatGenerator can be serialized to a dictionary with a Toolset."""
-        toolset = Toolset(tools[:1])
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "irrelevant"}, tools=toolset
-        )
-        data = generator.to_dict()
-
-        expected_tools_data = {
-            "type": "haystack.tools.toolset.Toolset",
-            "data": {
-                "tools": [
-                    {
-                        "type": "haystack.tools.tool.Tool",
-                        "data": {
-                            "name": "weather",
-                            "description": "useful to determine the weather in a given location",
-                            "parameters": {
-                                "type": "object",
-                                "properties": {"city": {"type": "string"}},
-                                "required": ["city"],
-                            },
-                            "function": "generators.chat.test_hugging_face_api.get_weather",
-                            "async_function": None,
-                            "outputs_to_string": None,
-                            "inputs_from_state": None,
-                            "outputs_to_state": None,
-                        },
-                    }
-                ]
-            },
-        }
-        assert data["init_parameters"]["tools"] == expected_tools_data
-
-    def test_convert_tools_to_hfapi_tools(self):
-        assert _convert_tools_to_hfapi_tools(None) is None
-        assert _convert_tools_to_hfapi_tools([]) is None
-
-        tool = Tool(
-            name="weather",
-            description="useful to determine the weather in a given location",
-            parameters={"city": {"type": "string"}},
-            function=get_weather,
-        )
-        hf_tools = _convert_tools_to_hfapi_tools([tool])
-        assert len(hf_tools) == 1
-        assert hf_tools[0].type == "function"
-        assert hf_tools[0].function.name == "weather"
-        assert hf_tools[0].function.description == "useful to determine the weather in a given location"
-        assert hf_tools[0].function.parameters == {"city": {"type": "string"}}
-
-    def test_convert_tools_to_hfapi_tools_legacy(self):
-        # this satisfies the check hasattr(ChatCompletionInputFunctionDefinition, "arguments")
-        mock_class = MagicMock()
-
-        with patch(
-            "haystack.components.generators.chat.hugging_face_api.ChatCompletionInputFunctionDefinition", mock_class
-        ):
-            tool = Tool(
-                name="weather",
-                description="useful to determine the weather in a given location",
-                parameters={"city": {"type": "string"}},
-                function=get_weather,
-            )
-            _convert_tools_to_hfapi_tools([tool])
-
-        mock_class.assert_called_once_with(
-            name="weather",
-            arguments={"city": {"type": "string"}},
-            description="useful to determine the weather in a given location",
-        )
-
-    def test_warm_up_with_tools(self, mock_check_valid_model):
-        """Test that warm_up() calls warm_up on tools and is idempotent."""
-
-        # Create a mock tool that tracks if warm_up() was called
-        class MockTool(Tool):
-            warm_up_call_count = 0  # Class variable to track calls
-
-            def __init__(self):
-                super().__init__(
-                    name="mock_tool",
-                    description="A mock tool for testing",
-                    parameters={"x": {"type": "string"}},
-                    function=lambda x: x,
-                )
-
-            def warm_up(self):
-                MockTool.warm_up_call_count += 1
-
-        # Reset the class variable before test
-        MockTool.warm_up_call_count = 0
-        mock_tool = MockTool()
-
-        # Create HuggingFaceAPIChatGenerator with the mock tool
-        component = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "HuggingFaceH4/zephyr-7b-alpha"},
-            tools=[mock_tool],
-        )
-
-        # Verify initial state - warm_up not called yet
-        assert MockTool.warm_up_call_count == 0
-        assert not component._is_warmed_up
-
-        # Call warm_up() on the generator
-        component.warm_up()
-
-        # Assert that the tool's warm_up() was called
-        assert MockTool.warm_up_call_count == 1
-        assert component._is_warmed_up
-
-        # Call warm_up() again and verify it's idempotent (only warms up once)
-        component.warm_up()
-
-        # The tool's warm_up should still only have been called once
-        assert MockTool.warm_up_call_count == 1
-        assert component._is_warmed_up
-
-    def test_warm_up_with_no_tools(self, mock_check_valid_model):
-        """Test that warm_up() works when no tools are provided."""
-        component = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API, api_params={"model": "HuggingFaceH4/zephyr-7b-alpha"}
-        )
-
-        # Verify initial state
-        assert not component._is_warmed_up
-        assert component.tools is None
-
-        # Call warm_up() - should not raise an error
-        component.warm_up()
-
-        # Verify the component is warmed up
-        assert component._is_warmed_up
-
-        # Call warm_up() again - should be idempotent
-        component.warm_up()
-        assert component._is_warmed_up
-
-    def test_warm_up_with_multiple_tools(self, mock_check_valid_model):
-        """Test that warm_up() works with multiple tools."""
-        # Track warm_up calls
-        warm_up_calls = []
-
-        class MockTool(Tool):
-            def __init__(self, tool_name):
-                super().__init__(
-                    name=tool_name,
-                    description=f"Mock tool {tool_name}",
-                    parameters={"type": "object", "properties": {"x": {"type": "string"}}, "required": ["x"]},
-                    function=lambda x: f"{tool_name} result: {x}",
-                )
-
-            def warm_up(self):
-                warm_up_calls.append(self.name)
-
-        mock_tool1 = MockTool("tool1")
-        mock_tool2 = MockTool("tool2")
-
-        # Use a LIST of tools, not a Toolset
-        component = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "HuggingFaceH4/zephyr-7b-alpha"},
-            tools=[mock_tool1, mock_tool2],
-        )
-
-        # Call warm_up()
-        component.warm_up()
-
-        # Assert that both tools' warm_up() were called
-        assert "tool1" in warm_up_calls
-        assert "tool2" in warm_up_calls
-        assert component._is_warmed_up
-
-        # Track count
-        call_count = len(warm_up_calls)
-
-        # Verify idempotency
-        component.warm_up()
-        assert len(warm_up_calls) == call_count
-
-    def test_run_with_reasoning_non_streaming(self, mock_check_valid_model, chat_messages):
-        """Test that reasoning content is correctly extracted from non-streaming responses."""
-        with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion:
-            reasoning_text = "Let me think about this. France is a country in Europe. Its capital city is Paris."
-            completion = ChatCompletionOutput(
-                choices=[
-                    ChatCompletionOutputComplete(
-                        finish_reason="eos_token",
-                        index=0,
-                        message=ChatCompletionOutputMessage(
-                            content="The capital of France is Paris.", role="assistant", reasoning=reasoning_text
-                        ),
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                usage=ChatCompletionOutputUsage(completion_tokens=20, prompt_tokens=17, total_tokens=37),
-                created=1710498360,
-            )
-            mock_chat_completion.return_value = completion
-
-            generator = HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-            )
-
-            response = generator.run(chat_messages)
-
-            assert "replies" in response
-            assert len(response["replies"]) == 1
-            reply = response["replies"][0]
-            assert reply.text == "The capital of France is Paris."
-            assert reply.reasoning is not None
-            assert isinstance(reply.reasoning, ReasoningContent)
-            assert reply.reasoning.reasoning_text == reasoning_text
-
-    def test_run_without_reasoning_non_streaming(self, mock_check_valid_model, mock_chat_completion, chat_messages):
-        """Test that responses without reasoning work correctly (backward compatibility)."""
-        generator = HuggingFaceAPIChatGenerator(
-            api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-            api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-        )
-
-        response = generator.run(chat_messages)
-
-        assert "replies" in response
-        assert len(response["replies"]) == 1
-        reply = response["replies"][0]
-        assert reply.text == "The capital of France is Paris."
-        assert reply.reasoning is None
-
-    def test_run_with_reasoning_streaming(self, mock_check_valid_model, chat_messages):
-        """Test that reasoning content is correctly extracted from streaming responses."""
-        streaming_chunks_received = []
-
-        def streaming_callback_fn(chunk: StreamingChunk):
-            streaming_chunks_received.append(chunk)
-
-        with patch("huggingface_hub.InferenceClient.chat_completion", autospec=True) as mock_chat_completion:
-            # Create a fake streamed response with reasoning
-            def mock_iter(self):
-                # First chunk with reasoning
-                yield ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(
-                                role="assistant", content=None, reasoning="Let me think..."
-                            ),
-                            index=0,
-                            finish_reason=None,
-                        )
-                    ],
-                    id="some_id",
-                    model="some_model",
-                    system_fingerprint="some_fingerprint",
-                    created=1710498504,
-                )
-                # Second chunk with more reasoning
-                yield ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(
-                                role=None, content=None, reasoning=" The capital of France is Paris."
-                            ),
-                            index=0,
-                            finish_reason=None,
-                        )
-                    ],
-                    id="some_id",
-                    model="some_model",
-                    system_fingerprint="some_fingerprint",
-                    created=1710498504,
-                )
-                # Third chunk with actual content
-                yield ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(role=None, content="Paris", reasoning=None),
-                            index=0,
-                            finish_reason=None,
-                        )
-                    ],
-                    id="some_id",
-                    model="some_model",
-                    system_fingerprint="some_fingerprint",
-                    created=1710498504,
-                )
-                # Final chunk with finish reason
-                yield ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(role=None, content=None, reasoning=None),
-                            index=0,
-                            finish_reason="stop",
-                        )
-                    ],
-                    id="some_id",
-                    model="some_model",
-                    system_fingerprint="some_fingerprint",
-                    created=1710498504,
-                )
-
-            mock_response = Mock(__iter__=mock_iter)
-            mock_chat_completion.return_value = mock_response
-
-            generator = HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-                streaming_callback=streaming_callback_fn,
-            )
-
-            response = generator.run(chat_messages)
-
-            # Check streaming chunks received with reasoning
-            assert len(streaming_chunks_received) == 4
-            assert streaming_chunks_received[0].reasoning is not None
-            assert streaming_chunks_received[0].reasoning.reasoning_text == "Let me think..."
-            assert streaming_chunks_received[1].reasoning is not None
-            assert streaming_chunks_received[1].reasoning.reasoning_text == " The capital of France is Paris."
-
-            # Check final message
-            assert "replies" in response
-            assert len(response["replies"]) == 1
-            reply = response["replies"][0]
-            assert reply.text == "Paris"
-            assert reply.reasoning is not None
-            assert isinstance(reply.reasoning, ReasoningContent)
-            assert reply.reasoning.reasoning_text == "Let me think... The capital of France is Paris."
-
-    @pytest.mark.asyncio
-    async def test_run_async_with_reasoning_non_streaming(self, mock_check_valid_model, chat_messages):
-        """Test that reasoning content is correctly extracted from async non-streaming responses."""
-        with patch(
-            "huggingface_hub.AsyncInferenceClient.chat_completion", new_callable=AsyncMock
-        ) as mock_chat_completion:
-            completion = ChatCompletionOutput(
-                choices=[
-                    ChatCompletionOutputComplete(
-                        finish_reason="eos_token",
-                        index=0,
-                        message=ChatCompletionOutputMessage(
-                            content="The capital of France is Paris.",
-                            role="assistant",
-                            reasoning="Let me reason about this question step by step.",
-                        ),
-                    )
-                ],
-                id="some_id",
-                model="some_model",
-                system_fingerprint="some_fingerprint",
-                usage=ChatCompletionOutputUsage(completion_tokens=20, prompt_tokens=17, total_tokens=37),
-                created=1710498360,
-            )
-            mock_chat_completion.return_value = completion
-
-            generator = HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-            )
-
-            response = await generator.run_async(chat_messages)
-
-            assert "replies" in response
-            assert len(response["replies"]) == 1
-            reply = response["replies"][0]
-            assert reply.text == "The capital of France is Paris."
-            assert reply.reasoning is not None
-            assert isinstance(reply.reasoning, ReasoningContent)
-            assert reply.reasoning.reasoning_text == "Let me reason about this question step by step."
-
-    @pytest.mark.asyncio
-    async def test_run_async_with_reasoning_streaming(self, mock_check_valid_model, chat_messages):
-        """Test that reasoning content is correctly extracted from async streaming responses."""
-        streaming_chunks_received = []
-
-        async def streaming_callback_fn(chunk: StreamingChunk):
-            streaming_chunks_received.append(chunk)
-
-        with patch(
-            "huggingface_hub.AsyncInferenceClient.chat_completion", new_callable=AsyncMock
-        ) as mock_chat_completion:
-            # Create async iterable for streaming
-            async def mock_aiter():
-                # First chunk with reasoning
-                yield ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(
-                                role="assistant", content=None, reasoning="Thinking..."
-                            ),
-                            index=0,
-                            finish_reason=None,
-                        )
-                    ],
-                    id="some_id",
-                    model="some_model",
-                    system_fingerprint="some_fingerprint",
-                    created=1710498504,
-                )
-                # Second chunk with content
-                yield ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(role=None, content="Paris", reasoning=None),
-                            index=0,
-                            finish_reason=None,
-                        )
-                    ],
-                    id="some_id",
-                    model="some_model",
-                    system_fingerprint="some_fingerprint",
-                    created=1710498504,
-                )
-                # Final chunk
-                yield ChatCompletionStreamOutput(
-                    choices=[
-                        ChatCompletionStreamOutputChoice(
-                            delta=ChatCompletionStreamOutputDelta(role=None, content=None, reasoning=None),
-                            index=0,
-                            finish_reason="stop",
-                        )
-                    ],
-                    id="some_id",
-                    model="some_model",
-                    system_fingerprint="some_fingerprint",
-                    created=1710498504,
-                )
-
-            mock_chat_completion.return_value = mock_aiter()
-
-            generator = HuggingFaceAPIChatGenerator(
-                api_type=HFGenerationAPIType.SERVERLESS_INFERENCE_API,
-                api_params={"model": "meta-llama/Llama-2-13b-chat-hf"},
-                streaming_callback=streaming_callback_fn,
-            )
-
-            response = await generator.run_async(chat_messages)
-
-            # Check streaming chunks
-            assert len(streaming_chunks_received) == 3
-            assert streaming_chunks_received[0].reasoning is not None
-            assert streaming_chunks_received[0].reasoning.reasoning_text == "Thinking..."
-
-            # Check final message
-            assert "replies" in response
-            assert len(response["replies"]) == 1
-            reply = response["replies"][0]
-            assert reply.text == "Paris"
-            assert reply.reasoning is not None
-            assert isinstance(reply.reasoning, ReasoningContent)
-            assert reply.reasoning.reasoning_text == "Thinking..."
-
-    def test_convert_chat_completion_stream_output_to_streaming_chunk_with_reasoning(self):
-        """Test that reasoning is correctly extracted from streaming chunks."""
-        # In streaming mode, reasoning and content come in separate chunks
-        chunk = ChatCompletionStreamOutput(
-            choices=[
-                ChatCompletionStreamOutputChoice(
-                    delta=ChatCompletionStreamOutputDelta(
-                        role="assistant", content=None, reasoning="Let me think about this."
-                    ),
-                    index=0,
-                    finish_reason=None,
-                )
-            ],
-            id="some_id",
-            model="some_model",
-            system_fingerprint="some_fingerprint",
-            created=1710498504,
-        )
-
-        streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(chunk=chunk, previous_chunks=[])
-
-        assert streaming_chunk.content == ""
-        assert streaming_chunk.reasoning is not None
-        assert isinstance(streaming_chunk.reasoning, ReasoningContent)
-        assert streaming_chunk.reasoning.reasoning_text == "Let me think about this."
-
-    def test_convert_chat_completion_stream_output_to_streaming_chunk_without_reasoning(self):
-        """Test that chunks without reasoning still work correctly."""
-        chunk = ChatCompletionStreamOutput(
-            choices=[
-                ChatCompletionStreamOutputChoice(
-                    delta=ChatCompletionStreamOutputDelta(role="assistant", content="Hello"),
-                    index=0,
-                    finish_reason=None,
-                )
-            ],
-            id="some_id",
-            model="some_model",
-            system_fingerprint="some_fingerprint",
-            created=1710498504,
-        )
-
-        streaming_chunk = _convert_chat_completion_stream_output_to_streaming_chunk(chunk=chunk, previous_chunks=[])
-
-        assert streaming_chunk.content == "Hello"
-        assert streaming_chunk.reasoning is None
-
-    def test_resolve_schema_refs_no_defs(self):
-        """Schema without $defs is returned as-is."""
-        schema = {"type": "object", "properties": {"name": {"type": "string"}}}
-        assert _resolve_schema_refs(schema) == schema
-
-    def test_resolve_schema_refs_expands_defs(self):
-        """Schema with $defs and $ref is expanded correctly."""
-        schema = {
-            "$defs": {
-                "User": {
-                    "type": "object",
-                    "properties": {"name": {"type": "string"}, "age": {"type": "integer"}},
-                    "required": ["name"],
-                }
-            },
-            "type": "object",
-            "properties": {"user": {"$ref": "#/$defs/User"}},
-            "required": ["user"],
-        }
-        resolved = _resolve_schema_refs(schema)
-        assert "$defs" not in resolved
-        assert "$ref" not in resolved["properties"]["user"]
-        assert resolved["properties"]["user"]["type"] == "object"
-        assert resolved["properties"]["user"]["properties"]["name"] == {"type": "string"}
-
-    def test_resolve_schema_refs_nested_refs(self):
-        """Schema with nested $ref references is expanded correctly."""
-        schema = {
-            "$defs": {
-                "Address": {"type": "object", "properties": {"street": {"type": "string"}}},
-                "User": {
-                    "type": "object",
-                    "properties": {"name": {"type": "string"}, "address": {"$ref": "#/$defs/Address"}},
-                },
-            },
-            "type": "object",
-            "properties": {"user": {"$ref": "#/$defs/User"}},
-        }
-        resolved = _resolve_schema_refs(schema)
-        assert "$defs" not in resolved
-        user = resolved["properties"]["user"]
-        assert user["properties"]["address"]["type"] == "object"
-        assert user["properties"]["address"]["properties"]["street"] == {"type": "string"}
-
-    def test_convert_tools_to_hfapi_tools_resolves_defs(self):
-        """Tool schemas with $defs are resolved before passing to HF API."""
-        tool = Tool(
-            name="get_user",
-            description="Get user info",
-            parameters={
-                "$defs": {"User": {"type": "object", "properties": {"name": {"type": "string"}}}},
-                "type": "object",
-                "properties": {"user": {"$ref": "#/$defs/User"}},
-            },
-            function=lambda user: user,
-        )
-        hf_tools = _convert_tools_to_hfapi_tools([tool])
-        assert hf_tools is not None
-        assert len(hf_tools) == 1
-        params = hf_tools[0].function.parameters or hf_tools[0].function.arguments
-        assert "$defs" not in params
-        assert params["properties"]["user"]["type"] == "object"
diff --git a/test/components/preprocessors/test_embedding_based_document_splitter.py b/test/components/preprocessors/test_embedding_based_document_splitter.py
index 4e5ccfa053..9bcb737197 100644
--- a/test/components/preprocessors/test_embedding_based_document_splitter.py
+++ b/test/components/preprocessors/test_embedding_based_document_splitter.py
@@ -9,7 +9,7 @@
 import pytest
 
 from haystack import Document
-from haystack.components.embedders import HuggingFaceAPIDocumentEmbedder, SentenceTransformersDocumentEmbedder
+from haystack.components.embedders import OpenAIDocumentEmbedder, SentenceTransformersDocumentEmbedder
 from haystack.components.preprocessors import EmbeddingBasedDocumentSplitter
 from haystack.utils import ComponentDevice
 
@@ -403,16 +403,10 @@ def test_split_document_with_multiple_topics(self, del_hf_env_vars, monkeypatch)
         assert combined in original or original in combined
 
     @pytest.mark.asyncio
-    @pytest.mark.skipif(
-        not os.environ.get("TEI_URL", None),
-        reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
-    )
-    @pytest.mark.slow
+    @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set")
     @pytest.mark.integration
     async def test_split_document_with_multiple_topics_async(self) -> None:
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
-        )
+        embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
 
         splitter = EmbeddingBasedDocumentSplitter(
             document_embedder=embedder, sentences_per_group=2, percentile=0.9, min_length=30, max_length=300
@@ -467,16 +461,10 @@ def test_trailing_whitespace_is_preserved(self, del_hf_env_vars):
         assert result["documents"][0].content == text
 
     @pytest.mark.asyncio
-    @pytest.mark.skipif(
-        not os.environ.get("TEI_URL", None),
-        reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
-    )
-    @pytest.mark.slow
+    @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set")
     @pytest.mark.integration
     async def test_trailing_whitespace_is_preserved_async(self) -> None:
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
-        )
+        embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
         splitter = EmbeddingBasedDocumentSplitter(document_embedder=embedder, sentences_per_group=1)
 
         # Normal trailing whitespace
@@ -524,16 +512,10 @@ def test_no_extra_whitespaces_between_sentences(self, del_hf_env_vars):
         )  # noqa: E501
 
     @pytest.mark.asyncio
-    @pytest.mark.skipif(
-        not os.environ.get("TEI_URL", None),
-        reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
-    )
+    @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set")
     @pytest.mark.integration
-    @pytest.mark.slow
     async def test_no_extra_whitespaces_between_sentences_async(self) -> None:
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
-        )
+        embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
 
         splitter = EmbeddingBasedDocumentSplitter(
             document_embedder=embedder, sentences_per_group=1, percentile=0.9, min_length=10, max_length=500
@@ -600,21 +582,15 @@ def test_split_large_splits_recursion(self, del_hf_env_vars):
             assert "page_number" in split_doc.meta
 
     @pytest.mark.asyncio
-    @pytest.mark.skipif(
-        not os.environ.get("TEI_URL", None),
-        reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
-    )
+    @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set")
     @pytest.mark.integration
-    @pytest.mark.slow
     async def test_split_large_splits_recursion_async(self) -> None:
         """
         Test that _split_large_splits() works correctly without infinite loops.
         This test uses a longer text that will trigger the recursive splitting logic.
         If the chunk cannot be split further, it is allowed to be larger than max_length.
         """
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
-        )
+        embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
         semantic_chunker = EmbeddingBasedDocumentSplitter(
             document_embedder=embedder, sentences_per_group=5, percentile=0.95, min_length=50, max_length=1000
         )
@@ -729,20 +705,14 @@ def test_split_large_splits_actually_splits(self, del_hf_env_vars):
                 assert split_doc.meta["page_number"] == 4
 
     @pytest.mark.asyncio
-    @pytest.mark.skipif(
-        not os.environ.get("TEI_URL", None),
-        reason="Export an env var called TEI_URL containing the TextEmbeddingInference url to run this test.",
-    )
+    @pytest.mark.skipif(os.environ.get("OPENAI_API_KEY", "") == "", reason="OPENAI_API_KEY is not set")
     @pytest.mark.integration
-    @pytest.mark.slow
     async def test_split_large_splits_actually_splits_async(self) -> None:
         """
         Test that _split_large_splits() actually works and can split long texts into multiple chunks.
         This test uses a very long text that should be split into multiple chunks.
         """
-        embedder = HuggingFaceAPIDocumentEmbedder(
-            api_type="text_embeddings_inference", api_params={"url": os.environ.get("TEI_URL")}
-        )
+        embedder = OpenAIDocumentEmbedder(model="text-embedding-3-small")
         semantic_chunker = EmbeddingBasedDocumentSplitter(
             document_embedder=embedder,
             sentences_per_group=3,
diff --git a/test/components/rankers/test_hugging_face_tei.py b/test/components/rankers/test_hugging_face_tei.py
deleted file mode 100644
index bcfbb06020..0000000000
--- a/test/components/rankers/test_hugging_face_tei.py
+++ /dev/null
@@ -1,351 +0,0 @@
-# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
-#
-# SPDX-License-Identifier: Apache-2.0
-
-from unittest.mock import MagicMock, patch
-
-import httpx
-import pytest
-
-from haystack import Document
-from haystack.components.rankers.hugging_face_tei import HuggingFaceTEIRanker, TruncationDirection
-from haystack.utils import Secret
-
-
-class TestHuggingFaceTEIRanker:
-    def test_init(self, del_hf_env_vars):
-        """Test initialization with default and custom parameters"""
-        # Default parameters
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com")
-        assert ranker.url == "https://api.my-tei-service.com"
-        assert ranker.top_k == 10
-        assert ranker.timeout == 30
-        assert not ranker.token.resolve_value()
-        assert ranker.max_retries == 3
-        assert ranker.retry_status_codes is None
-
-        # Custom parameters
-        token = Secret.from_token("my_api_token")
-        ranker = HuggingFaceTEIRanker(
-            url="https://api.my-tei-service.com",
-            top_k=5,
-            timeout=60,
-            token=token,
-            max_retries=5,
-            retry_status_codes=[500, 502, 503],
-        )
-        assert ranker.url == "https://api.my-tei-service.com"
-        assert ranker.top_k == 5
-        assert ranker.timeout == 60
-        assert ranker.token == token
-        assert ranker.max_retries == 5
-        assert ranker.retry_status_codes == [500, 502, 503]
-
-    def test_to_dict(self, del_hf_env_vars):
-        """Test serialization to dict with Secret token"""
-        component = HuggingFaceTEIRanker(
-            url="https://api.my-tei-service.com", top_k=5, timeout=30, max_retries=4, retry_status_codes=[500, 502]
-        )
-        data = component.to_dict()
-
-        assert data["type"] == "haystack.components.rankers.hugging_face_tei.HuggingFaceTEIRanker"
-        assert data["init_parameters"]["url"] == "https://api.my-tei-service.com"
-        assert data["init_parameters"]["top_k"] == 5
-        assert data["init_parameters"]["timeout"] == 30
-        assert data["init_parameters"]["token"] == {
-            "env_vars": ["HF_API_TOKEN", "HF_TOKEN"],
-            "strict": False,
-            "type": "env_var",
-        }
-        assert data["init_parameters"]["max_retries"] == 4
-        assert data["init_parameters"]["retry_status_codes"] == [500, 502]
-
-    def test_from_dict(self, del_hf_env_vars):
-        """Test deserialization from dict with environment variable token"""
-        data = {
-            "type": "haystack.components.rankers.hugging_face_tei.HuggingFaceTEIRanker",
-            "init_parameters": {
-                "url": "https://api.my-tei-service.com",
-                "top_k": 5,
-                "timeout": 30,
-                "token": {"type": "env_var", "env_vars": ["HF_API_TOKEN", "HF_TOKEN"], "strict": False},
-                "max_retries": 4,
-                "retry_status_codes": [500, 502],
-            },
-        }
-
-        component = HuggingFaceTEIRanker.from_dict(data)
-
-        assert component.url == "https://api.my-tei-service.com"
-        assert component.top_k == 5
-        assert component.timeout == 30
-        assert component.max_retries == 4
-        assert component.retry_status_codes == [500, 502]
-
-    def test_empty_documents(self, del_hf_env_vars):
-        """Test that empty documents list returns empty result"""
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com")
-        result = ranker.run(query="test query", documents=[])
-        assert result == {"documents": []}
-
-    @patch("haystack.components.rankers.hugging_face_tei.request_with_retry")
-    def test_run_with_mock(self, mock_request, del_hf_env_vars):
-        """Test run method with mocked API response"""
-        # Setup mock response
-        mock_response = MagicMock(spec=httpx.Response)
-        mock_response.json.return_value = [
-            {"index": 2, "score": 0.95},
-            {"index": 1, "score": 0.85},
-            {"index": 0, "score": 0.75},
-        ]
-        mock_request.return_value = mock_response
-
-        # Create ranker and test documents
-        token = Secret.from_token("test_token")
-        ranker = HuggingFaceTEIRanker(
-            url="https://api.my-tei-service.com",
-            top_k=3,
-            timeout=30,
-            token=token,
-            max_retries=4,
-            retry_status_codes=[500, 502],
-        )
-
-        docs = [Document(content="Document A"), Document(content="Document B"), Document(content="Document C")]
-
-        # Run the ranker
-        result = ranker.run(query="test query", documents=docs)
-
-        # Check that request_with_retry was called with correct parameters
-        mock_request.assert_called_once_with(
-            method="POST",
-            url="https://api.my-tei-service.com/rerank",
-            json={"query": "test query", "texts": ["Document A", "Document B", "Document C"], "raw_scores": False},
-            timeout=30,
-            headers={"Authorization": "Bearer test_token"},
-            attempts=4,
-            status_codes_to_retry=[500, 502],
-        )
-
-        # Check that documents are ranked correctly
-        assert len(result["documents"]) == 3
-        assert result["documents"][0].content == "Document C"
-        assert result["documents"][0].score == 0.95
-        assert result["documents"][1].content == "Document B"
-        assert result["documents"][1].score == 0.85
-        assert result["documents"][2].content == "Document A"
-        assert result["documents"][2].score == 0.75
-
-    @patch("haystack.components.rankers.hugging_face_tei.request_with_retry")
-    def test_run_with_truncation_direction(self, mock_request, del_hf_env_vars):
-        """Test run method with truncation direction parameter"""
-        # Setup mock response
-        mock_response = MagicMock(spec=httpx.Response)
-        mock_response.json.return_value = [{"index": 0, "score": 0.95}]
-        mock_request.return_value = mock_response
-
-        # Create ranker and test documents
-        token = Secret.from_token("test_token")
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com", token=token)
-        docs = [Document(content="Document A")]
-
-        # Run the ranker with truncation direction
-        ranker.run(query="test query", documents=docs, truncation_direction=TruncationDirection.LEFT)
-
-        # Check that request includes truncation parameters
-        mock_request.assert_called_once_with(
-            method="POST",
-            url="https://api.my-tei-service.com/rerank",
-            json={
-                "query": "test query",
-                "texts": ["Document A"],
-                "raw_scores": False,
-                "truncate": True,
-                "truncation_direction": "Left",
-            },
-            timeout=30,
-            headers={"Authorization": "Bearer test_token"},
-            attempts=3,
-            status_codes_to_retry=None,
-        )
-
-    @patch("haystack.components.rankers.hugging_face_tei.request_with_retry")
-    def test_run_with_custom_top_k(self, mock_request, del_hf_env_vars):
-        """Test run method with custom top_k parameter"""
-        # Setup mock response with 5 documents
-        mock_response = MagicMock(spec=httpx.Response)
-        mock_response.json.return_value = [
-            {"index": 4, "score": 0.95},
-            {"index": 3, "score": 0.90},
-            {"index": 2, "score": 0.85},
-            {"index": 1, "score": 0.80},
-            {"index": 0, "score": 0.75},
-        ]
-        mock_request.return_value = mock_response
-
-        # Create ranker with top_k=3
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com", top_k=3)
-
-        # Create 5 test documents
-        docs = [Document(content=f"Document {i}") for i in range(5)]
-
-        # Run the ranker
-        result = ranker.run(query="test query", documents=docs)
-
-        # Check that only top 3 documents are returned
-        assert len(result["documents"]) == 3
-        assert result["documents"][0].content == "Document 4"
-        assert result["documents"][1].content == "Document 3"
-        assert result["documents"][2].content == "Document 2"
-
-        # Test with run-time top_k override
-        result = ranker.run(query="test query", documents=docs, top_k=2)
-
-        # Check that only top 2 documents are returned
-        assert len(result["documents"]) == 2
-        assert result["documents"][0].content == "Document 4"
-        assert result["documents"][1].content == "Document 3"
-
-    @patch("haystack.components.rankers.hugging_face_tei.request_with_retry")
-    def test_run_deduplicates_documents(self, mock_request, del_hf_env_vars):
-        """Test that duplicate documents are removed before sending to the API."""
-        mock_response = MagicMock(spec=httpx.Response)
-        mock_response.json.return_value = [{"index": 1, "score": 0.9}, {"index": 0, "score": 0.2}]
-        mock_request.return_value = mock_response
-
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com")
-        # Document with duplicate id and lower score should be dropped
-        docs = [
-            Document(id="duplicate", content="keep me", score=0.9),
-            Document(id="duplicate", content="drop me", score=0.1),
-            Document(id="unique", content="unique"),
-        ]
-
-        result = ranker.run(query="test query", documents=docs)
-
-        mock_request.assert_called_once_with(
-            method="POST",
-            url="https://api.my-tei-service.com/rerank",
-            json={"query": "test query", "texts": ["keep me", "unique"], "raw_scores": False},
-            timeout=30,
-            headers={},
-            attempts=3,
-            status_codes_to_retry=None,
-        )
-        assert len(result["documents"]) == 2
-        assert result["documents"][0].content == "unique"
-        assert result["documents"][1].content == "keep me"
-
-    @patch("haystack.components.rankers.hugging_face_tei.request_with_retry")
-    def test_error_handling(self, mock_request, del_hf_env_vars):
-        """Test error handling in the ranker"""
-        # Setup mock response with error
-        mock_response = MagicMock(spec=httpx.Response)
-        mock_response.json.return_value = {"error": "Some error occurred", "error_type": "TestError"}
-        mock_request.return_value = mock_response
-
-        # Create ranker and test documents
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com")
-        docs = [Document(content="Document A")]
-
-        # Test that RuntimeError is raised with the correct message
-        with pytest.raises(
-            RuntimeError, match=r"HuggingFaceTEIRanker API call failed \(TestError\): Some error occurred"
-        ):
-            ranker.run(query="test query", documents=docs)
-
-        # Test unexpected response format
-        mock_response.json.return_value = {"unexpected": "format"}
-        with pytest.raises(TypeError, match="Unexpected response format from text-embeddings-inference rerank API"):
-            ranker.run(query="test query", documents=docs)
-
-    @pytest.mark.asyncio
-    @patch("haystack.components.rankers.hugging_face_tei.async_request_with_retry")
-    async def test_run_async_with_mock(self, mock_request, del_hf_env_vars):
-        """Test run_async method with mocked API response"""
-        # Setup mock response
-        mock_response = MagicMock(spec=httpx.Response)
-        mock_response.json.return_value = [
-            {"index": 2, "score": 0.95},
-            {"index": 1, "score": 0.85},
-            {"index": 0, "score": 0.75},
-        ]
-        mock_request.return_value = mock_response
-
-        # Create ranker and test documents
-        token = Secret.from_token("test_token")
-        ranker = HuggingFaceTEIRanker(
-            url="https://api.my-tei-service.com",
-            top_k=3,
-            timeout=30,
-            token=token,
-            max_retries=4,
-            retry_status_codes=[500, 502],
-        )
-
-        docs = [Document(content="Document A"), Document(content="Document B"), Document(content="Document C")]
-
-        # Run the ranker asynchronously
-        result = await ranker.run_async(query="test query", documents=docs)
-
-        # Check that async_request_with_retry was called with correct parameters
-        mock_request.assert_called_once_with(
-            method="POST",
-            url="https://api.my-tei-service.com/rerank",
-            json={"query": "test query", "texts": ["Document A", "Document B", "Document C"], "raw_scores": False},
-            timeout=30,
-            headers={"Authorization": "Bearer test_token"},
-            attempts=4,
-            status_codes_to_retry=[500, 502],
-        )
-
-        # Check that documents are ranked correctly
-        assert len(result["documents"]) == 3
-        assert result["documents"][0].content == "Document C"
-        assert result["documents"][0].score == 0.95
-        assert result["documents"][1].content == "Document B"
-        assert result["documents"][1].score == 0.85
-        assert result["documents"][2].content == "Document A"
-        assert result["documents"][2].score == 0.75
-
-    @pytest.mark.asyncio
-    @patch("haystack.components.rankers.hugging_face_tei.async_request_with_retry")
-    async def test_run_async_deduplicates_documents(self, mock_request, del_hf_env_vars):
-        """Test that duplicate documents are removed before sending to the API."""
-        mock_response = MagicMock(spec=httpx.Response)
-        mock_response.json.return_value = [{"index": 1, "score": 0.9}, {"index": 0, "score": 0.2}]
-        mock_request.return_value = mock_response
-
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com")
-        # Document with duplicate id and lower score should be dropped
-        docs = [
-            Document(id="duplicate", content="keep me", score=0.9),
-            Document(id="duplicate", content="drop me", score=0.1),
-            Document(id="unique", content="unique"),
-        ]
-
-        result = await ranker.run_async(query="test query", documents=docs)
-
-        mock_request.assert_called_once_with(
-            method="POST",
-            url="https://api.my-tei-service.com/rerank",
-            json={"query": "test query", "texts": ["keep me", "unique"], "raw_scores": False},
-            timeout=30,
-            headers={},
-            attempts=3,
-            status_codes_to_retry=None,
-        )
-        assert len(result["documents"]) == 2
-        assert result["documents"][0].content == "unique"
-        assert result["documents"][1].content == "keep me"
-
-    @pytest.mark.asyncio
-    @patch("haystack.components.rankers.hugging_face_tei.async_request_with_retry")
-    async def test_run_async_empty_documents(self, mock_request, del_hf_env_vars):
-        """Test run_async with empty documents list"""
-        ranker = HuggingFaceTEIRanker(url="https://api.my-tei-service.com")
-        result = await ranker.run_async(query="test query", documents=[])
-
-        # Check that no API call was made
-        mock_request.assert_not_called()
-        assert result == {"documents": []}