diff --git a/haystack/dataclasses/__init__.py b/haystack/dataclasses/__init__.py index 66d1c8ba7b..90fb55c0ca 100644 --- a/haystack/dataclasses/__init__.py +++ b/haystack/dataclasses/__init__.py @@ -15,6 +15,7 @@ "image_content": ["ImageContent"], "file_content": ["FileContent"], "document": ["Document"], + "skill_meta": ["SkillMeta"], "sparse_embedding": ["SparseEmbedding"], "state": ["State"], "streaming_chunk": [ @@ -46,6 +47,7 @@ from .document import Document as Document from .file_content import FileContent as FileContent from .image_content import ImageContent as ImageContent + from .skill_meta import SkillMeta as SkillMeta from .sparse_embedding import SparseEmbedding as SparseEmbedding from .streaming_chunk import AsyncStreamingCallbackT as AsyncStreamingCallbackT from .streaming_chunk import ComponentInfo as ComponentInfo diff --git a/haystack/dataclasses/skill_meta.py b/haystack/dataclasses/skill_meta.py new file mode 100644 index 0000000000..31dfda47d7 --- /dev/null +++ b/haystack/dataclasses/skill_meta.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from dataclasses import dataclass, field +from pathlib import Path + + +@dataclass +class SkillMeta: + """ + Metadata describing a single skill. + + :param name: The skill's name, used by the agent to load it. + :param description: A short description of when to use the skill. Shown to the agent up front. + :param path: The skill's directory. Set by `FileSystemSkillStore`; can be `None` for other stores. 
+ """ + + name: str + description: str + path: Path | None = field(default=None) diff --git a/haystack/skill_stores/__init__.py b/haystack/skill_stores/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/haystack/skill_stores/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/haystack/skill_stores/file_system/__init__.py b/haystack/skill_stores/file_system/__init__.py new file mode 100644 index 0000000000..bb1fdd2fd8 --- /dev/null +++ b/haystack/skill_stores/file_system/__init__.py @@ -0,0 +1,15 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import sys +from typing import TYPE_CHECKING + +from lazy_imports import LazyImporter + +_import_structure = {"skill_store": ["FileSystemSkillStore"]} + +if TYPE_CHECKING: + from .skill_store import FileSystemSkillStore as FileSystemSkillStore +else: + sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure) diff --git a/haystack/skill_stores/file_system/skill_store.py b/haystack/skill_stores/file_system/skill_store.py new file mode 100644 index 0000000000..0cc711061f --- /dev/null +++ b/haystack/skill_stores/file_system/skill_store.py @@ -0,0 +1,136 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from pathlib import Path +from typing import Any + +import yaml + +from haystack.core.serialization import default_from_dict, default_to_dict +from haystack.dataclasses.skill_meta import SkillMeta +from haystack.skill_stores.types.protocol import SKILL_FILE_NAME + + +def _parse_frontmatter(text: str) -> tuple[dict[str, Any], str]: + """ + Split a `SKILL.md` file into its YAML frontmatter and markdown body. + + The frontmatter is the YAML block delimited by leading and trailing `---` lines. 
If no frontmatter is + present, an empty mapping and the original text are returned. + + :param text: The full contents of a `SKILL.md` file. + :returns: A tuple of (frontmatter mapping, body). + :raises ValueError: If the frontmatter is valid YAML but not a mapping; malformed YAML raises `yaml.YAMLError`. + """ + stripped = text.lstrip() + if not stripped.startswith("---"): + return {}, text + + # Drop the leading '---' line, then split on the closing '---'. + after_open = stripped[len("---") :].lstrip("\n") + parts = after_open.split("\n---", 1) + if len(parts) != 2: + return {}, text + + frontmatter_block, body = parts + loaded = yaml.safe_load(frontmatter_block) or {} + if not isinstance(loaded, dict): + raise ValueError("Skill frontmatter must be a YAML mapping.") # noqa: TRY004 + return loaded, body.lstrip("\n") + + +class FileSystemSkillStore: + """ + SkillStore backed by a directory of skill sub-directories on the local filesystem. + + Expected layout: + + ``` + skills/ + pdf-forms/ + SKILL.md # frontmatter (name, description) + markdown instructions + reference/forms.md # optional bundled file + ``` + + Only the frontmatter of each `SKILL.md` is read at construction time (cheap); bodies and bundled + files are read lazily when the agent calls the corresponding tool. + + :param skills_dir: Root directory that contains one sub-directory per skill. + :raises ValueError: If `skills_dir` does not exist, is not a directory, a skill is missing a required + frontmatter field, or two skills share the same name. 
+ """ + + def __init__(self, skills_dir: str | Path) -> None: + self.skills_dir = Path(skills_dir) + self._skills = self._scan() + + def _scan(self) -> dict[str, SkillMeta]: + if not self.skills_dir.is_dir(): + raise ValueError(f"Skills directory '{self.skills_dir}' does not exist or is not a directory.") + + skills: dict[str, SkillMeta] = {} + for skill_file in sorted(self.skills_dir.glob(f"*/{SKILL_FILE_NAME}")): + skill_dir = skill_file.parent + frontmatter, _ = _parse_frontmatter(skill_file.read_text(encoding="utf-8")) + + name = frontmatter.get("name", skill_dir.name) + description = frontmatter.get("description") + if not description: + raise ValueError(f"Skill '{name}' ({skill_file}) is missing a 'description' in its frontmatter.") + if name in skills: + raise ValueError(f"Duplicate skill name '{name}' found in '{self.skills_dir}'.") + + skills[name] = SkillMeta(name=name, description=description, path=skill_dir) + return skills + + def list_skills(self) -> dict[str, SkillMeta]: + """Lists all skills available on disk""" + return self._skills + + def load_skill_body(self, name: str) -> str: + """Loads the skill body from disk""" + meta = self._skills.get(name) + if meta is None: + raise KeyError(name) + if meta.path is None: + raise ValueError(f"Skill '{name}' is missing its directory path in metadata.") + _, body = _parse_frontmatter((meta.path / SKILL_FILE_NAME).read_text(encoding="utf-8")) + return body + + def list_skill_files(self, name: str) -> list[str]: + """List all files in a skill directory, excluding the SKILL.md file.""" + meta = self._skills.get(name) + if meta is None: + raise KeyError(name) + if meta.path is None: + raise ValueError(f"Skill '{name}' is missing its directory path in metadata.") + return sorted( + p.relative_to(meta.path).as_posix() + for p in meta.path.rglob("*") + if p.is_file() and p.name != SKILL_FILE_NAME + ) + + def read_skill_file(self, name: str, path: str) -> str: + """read_skill_file implementation that prevents path 
traversal outside the skill directory.""" + meta = self._skills.get(name) + if meta is None: + raise KeyError(name) + if meta.path is None: + raise ValueError(f"Skill '{name}' is missing its directory path in metadata.") + skill_dir = meta.path.resolve() + target = (skill_dir / path).resolve() + if skill_dir != target and skill_dir not in target.parents: + raise PermissionError(f"path escapes the '{name}' skill directory") + if not target.is_file(): + raise FileNotFoundError(f"File '{path}' not found in skill '{name}'") + return target.read_text(encoding="utf-8") + + def to_dict(self) -> dict[str, Any]: + """Serialize this store to a dictionary for use with :meth:`from_dict`.""" + return default_to_dict(self, skills_dir=str(self.skills_dir)) + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "FileSystemSkillStore": + """Deserialize a FileSystemSkillStore from its dictionary representation.""" + return default_from_dict(cls, data) diff --git a/haystack/skill_stores/types/__init__.py b/haystack/skill_stores/types/__init__.py new file mode 100644 index 0000000000..9c2c1da8e7 --- /dev/null +++ b/haystack/skill_stores/types/__init__.py @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from .protocol import SkillStore + +__all__ = ["SkillStore"] diff --git a/haystack/skill_stores/types/protocol.py b/haystack/skill_stores/types/protocol.py new file mode 100644 index 0000000000..227d526c2b --- /dev/null +++ b/haystack/skill_stores/types/protocol.py @@ -0,0 +1,88 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Any, Protocol, runtime_checkable + +from haystack.dataclasses.skill_meta import SkillMeta + +SKILL_FILE_NAME = "SKILL.md" + + +@runtime_checkable +class SkillStore(Protocol): + """ + Protocol for a skill storage layer. + + A `SkillStore` is responsible for discovering available skills and providing their content on demand. 
+ Implement this protocol to back `haystack.tools.SkillToolset` with any storage system — a local + directory, a database, a remote API, or an in-memory fixture. + + The three content methods (`load_skill_body`, `list_skill_files`, + `read_skill_file`) are called lazily at agent runtime, not at construction time, so + implementations may defer I/O until a skill is actually needed. + """ + + def list_skills(self) -> dict[str, SkillMeta]: + """ + Discover and return all available skills. + + Called once during `haystack.tools.SkillToolset` initialization to build the skills catalog. + + :returns: Mapping of skill name to its metadata. + """ + ... + + def load_skill_body(self, name: str) -> str: + """ + Return the markdown body of the named skill's instructions. + + :param name: Skill name as returned by `list_skills`. + :returns: The raw markdown body (frontmatter stripped). + :raises KeyError: If no skill with `name` exists. + """ + ... + + def list_skill_files(self, name: str) -> list[str]: + """ + Return the relative paths of any files bundled with the named skill. + + :param name: Skill name as returned by `list_skills`. + :returns: Sorted list of POSIX-style paths relative to the skill root. Empty when there are no extras. + :raises KeyError: If no skill with `name` exists. + """ + ... + + def read_skill_file(self, name: str, path: str) -> str: + """ + Read a file bundled with the named skill. + + :param name: Skill name as returned by `list_skills`. + :param path: Path of the file relative to the skill root (e.g. `"reference/forms.md"`). + :returns: The file's text content. + :raises KeyError: If no skill with `name` exists. + :raises PermissionError: If `path` escapes the skill's root (path-traversal attempt). + :raises FileNotFoundError: If the file does not exist within the skill. + """ + ... + + def to_dict(self) -> dict[str, Any]: + """ + Serialize this store to a dictionary for use with `from_dict`. 
+ + Override both this method and `from_dict` to make your custom store serializable with + `haystack.tools.SkillToolset`. + """ + ... + + @classmethod + def from_dict(cls, data: dict[str, Any]) -> "SkillStore": + """ + Deserialize a store from a dictionary produced by `to_dict`. + + Override both this method and `to_dict` to make your custom store serializable with + `haystack.tools.SkillToolset`. + + :param data: Dictionary as produced by `to_dict`. + """ + ... diff --git a/haystack/tools/__init__.py b/haystack/tools/__init__.py index 827c484a99..37be582377 100644 --- a/haystack/tools/__init__.py +++ b/haystack/tools/__init__.py @@ -11,7 +11,7 @@ from haystack.tools.tool import Tool, _check_duplicate_tool_names from haystack.tools.toolset import Toolset from haystack.tools.searchable_toolset import SearchableToolset -from haystack.tools.skills import SkillMeta, SkillToolset +from haystack.tools.skills import SkillToolset from haystack.tools.component_tool import ComponentTool from haystack.tools.pipeline_tool import PipelineTool from haystack.tools.serde_utils import deserialize_tools_or_toolset_inplace, serialize_tools_or_toolset @@ -33,7 +33,6 @@ "serialize_tools_or_toolset", "Tool", "SearchableToolset", - "SkillMeta", "SkillToolset", "ToolsType", "Toolset", diff --git a/haystack/tools/skills/__init__.py b/haystack/tools/skills/__init__.py index a63816d545..131168ab29 100644 --- a/haystack/tools/skills/__init__.py +++ b/haystack/tools/skills/__init__.py @@ -2,6 +2,6 @@ # # SPDX-License-Identifier: Apache-2.0 -from haystack.tools.skills.skill_toolset import SkillMeta, SkillToolset +from haystack.tools.skills.skill_toolset import SkillToolset -__all__ = ["SkillMeta", "SkillToolset"] +__all__ = ["SkillToolset"] diff --git a/haystack/tools/skills/skill_toolset.py b/haystack/tools/skills/skill_toolset.py index ee17a2d8bb..3d9ae82a68 100644 --- a/haystack/tools/skills/skill_toolset.py +++ b/haystack/tools/skills/skill_toolset.py @@ -2,113 +2,64 @@ # # 
SPDX-License-Identifier: Apache-2.0 -from dataclasses import dataclass -from pathlib import Path from typing import Annotated, Any -import yaml - from haystack.core.serialization import generate_qualified_class_name +from haystack.dataclasses.skill_meta import SkillMeta +from haystack.skill_stores.types.protocol import SkillStore from haystack.tools.from_function import create_tool_from_function from haystack.tools.tool import Tool from haystack.tools.toolset import Toolset - -SKILL_FILE_NAME = "SKILL.md" - - -@dataclass -class SkillMeta: - """ - Metadata describing a single skill discovered on disk. - - :param name: The skill's name, used by the agent to load it. - :param description: A short description of when to use the skill. Shown to the agent up front. - :param path: The skill's directory. - """ - - name: str - description: str - path: Path - - -def _parse_frontmatter(text: str) -> tuple[dict[str, Any], str]: - """ - Split a `SKILL.md` file into its YAML frontmatter and markdown body. - - The frontmatter is the YAML block delimited by leading and trailing `---` lines. If no frontmatter is - present, an empty mapping and the original text are returned. - - :param text: The full contents of a `SKILL.md` file. - :returns: A tuple of (frontmatter mapping, body). - :raises ValueError: If the frontmatter is present but is not a valid YAML mapping. - """ - stripped = text.lstrip() - if not stripped.startswith("---"): - return {}, text - - # Drop the leading '---' line, then split on the closing '---'. 
- after_open = stripped[len("---") :].lstrip("\n") - parts = after_open.split("\n---", 1) - if len(parts) != 2: - return {}, text - - frontmatter_block, body = parts - loaded = yaml.safe_load(frontmatter_block) or {} - if not isinstance(loaded, dict): - raise ValueError("Skill frontmatter must be a YAML mapping.") # noqa: TRY004 - return loaded, body.lstrip("\n") +from haystack.utils.deserialization import deserialize_component_inplace class SkillToolset(Toolset): """ - A Toolset that lets an Agent discover and read filesystem "skills" via progressive disclosure. + A Toolset that lets an Agent discover and read skills via progressive disclosure. - A skill is a directory containing a `SKILL.md` file with YAML frontmatter (`name` and `description`) and a - markdown body of instructions. Skills may bundle additional files (reference docs, examples, templates). - This mirrors how Claude Code and Codex expose skills: + A skill is a directory (or equivalent storage unit) containing a `SKILL.md` file with YAML frontmatter + (`name` and `description`) and a markdown body of instructions. Skills may bundle additional files + (reference docs, examples, templates). This mirrors how Claude Code and Codex expose skills: - The name and description of every skill are injected into the Agent's system prompt (via `system_prompt_contribution`) so the model knows which skills exist. - `load_skill` returns a skill's full instructions on demand, plus a manifest of its bundled files. - `read_skill_file` reads a bundled file on demand. 
- Expected layout: - - ``` - skills/ - pdf-forms/ - SKILL.md # frontmatter (name, description) + markdown instructions - reference/forms.md - ``` - - ### Usage example + **Example usage:** ```python from haystack.components.agents import Agent from haystack.components.generators.chat import OpenAIChatGenerator from haystack.dataclasses import ChatMessage from haystack.tools import SkillToolset + from haystack.skill_stores.file_system import FileSystemSkillStore - skills = SkillToolset("skills/") + store = FileSystemSkillStore("skills/") + skills = SkillToolset(store) agent = Agent(chat_generator=OpenAIChatGenerator(), tools=skills) - # The skills catalog is appended to the system prompt automatically. result = agent.run(messages=[ChatMessage.from_user("Fill in this PDF form for me.")]) ``` + + Expected filesystem layout: + + ``` + skills/ + pdf-forms/ + SKILL.md # frontmatter (name, description) + markdown instructions + reference/forms.md + ``` """ - def __init__(self, skills_dir: str | Path) -> None: + def __init__(self, store: SkillStore) -> None: """ - Initialize the SkillToolset by scanning a directory for skills. - - Only the frontmatter of each `SKILL.md` is read at construction time (cheap); bodies and bundled files - are read lazily when the agent calls `load_skill` / `read_skill_file`. + Initialize the SkillToolset. - :param skills_dir: Directory containing one subdirectory per skill, each with a `SKILL.md`. - :raises ValueError: If `skills_dir` does not exist, is not a directory, a skill is missing a required - frontmatter field, or two skills share the same name. + :param store: A `haystack.skill_stores.types.SkillStore` instance to back this toolset. 
""" - self.skills_dir = Path(skills_dir) - self._skills: dict[str, SkillMeta] = self._scan() + self._store = store + + self._skills: dict[str, SkillMeta] = self._store.list_skills() super().__init__(tools=[self._create_load_skill_tool(), self._create_read_skill_file_tool()]) @property @@ -116,31 +67,6 @@ def skills(self) -> dict[str, SkillMeta]: """Mapping of skill name to its metadata.""" return self._skills - def _scan(self) -> dict[str, SkillMeta]: - """ - Scan `skills_dir` for skills, reading only the frontmatter of each `SKILL.md`. - - :returns: Mapping of skill name to metadata. - :raises ValueError: On a missing directory, missing required frontmatter, or duplicate skill names. - """ - if not self.skills_dir.is_dir(): - raise ValueError(f"Skills directory '{self.skills_dir}' does not exist or is not a directory.") - - skills: dict[str, SkillMeta] = {} - for skill_file in sorted(self.skills_dir.glob(f"*/{SKILL_FILE_NAME}")): - skill_dir = skill_file.parent - frontmatter, _ = _parse_frontmatter(skill_file.read_text(encoding="utf-8")) - - name = frontmatter.get("name", skill_dir.name) - description = frontmatter.get("description") - if not description: - raise ValueError(f"Skill '{name}' ({skill_file}) is missing a 'description' in its frontmatter.") - if name in skills: - raise ValueError(f"Duplicate skill name '{name}' found in '{self.skills_dir}'.") - - skills[name] = SkillMeta(name=name, description=description, path=skill_dir) - return skills - def system_prompt_contribution(self) -> str | None: """ Render the skills catalog and usage instructions for injection into the Agent's system prompt. 
@@ -165,22 +91,17 @@ def system_prompt_contribution(self) -> str | None: return "\n".join(lines) def _create_load_skill_tool(self) -> Tool: - """Create the `load_skill` tool, closed over this toolset's skill registry.""" + """Create the `load_skill` tool, closed over this toolset's store.""" def load_skill(name: Annotated[str, "Exact name of the skill to load, from the Available Skills list."]) -> str: """Load a skill's full instructions. Call this before doing a task the skill covers.""" - meta = self._skills.get(name) - if meta is None: + try: + body = self._store.load_skill_body(name) + bundled = self._store.list_skill_files(name) + except KeyError: available = ", ".join(self._skills) or "none" return f"Unknown skill '{name}'. Available skills: {available}." - _, body = _parse_frontmatter((meta.path / SKILL_FILE_NAME).read_text(encoding="utf-8")) - - bundled = sorted( - p.relative_to(meta.path).as_posix() - for p in meta.path.rglob("*") - if p.is_file() and p.name != SKILL_FILE_NAME - ) if bundled: manifest = "\n".join(f"- {path}" for path in bundled) body = f"{body}\n\n---\nBundled files (read with `read_skill_file`):\n{manifest}" @@ -189,25 +110,22 @@ def load_skill(name: Annotated[str, "Exact name of the skill to load, from the A return create_tool_from_function(function=load_skill, name="load_skill") def _create_read_skill_file_tool(self) -> Tool: - """Create the `read_skill_file` tool, closed over this toolset's skill registry.""" + """Create the `read_skill_file` tool, closed over this toolset's store.""" def read_skill_file( name: Annotated[str, "Name of the skill that owns the file."], path: Annotated[str, "Path of the file relative to the skill directory, e.g. 
'reference/forms.md'."], ) -> str: """Read a file bundled with a skill (reference docs, examples, templates).""" - meta = self._skills.get(name) - if meta is None: + try: + return self._store.read_skill_file(name, path) + except KeyError: available = ", ".join(self._skills) or "none" return f"Unknown skill '{name}'. Available skills: {available}." - - skill_dir = meta.path.resolve() - target = (skill_dir / path).resolve() - if skill_dir != target and skill_dir not in target.parents: + except PermissionError: return f"Refusing to read '{path}': path escapes the '{name}' skill directory." - if not target.is_file(): + except FileNotFoundError: return f"File '{path}' not found in skill '{name}'." - return target.read_text(encoding="utf-8") return create_tool_from_function(function=read_skill_file, name="read_skill_file") @@ -215,18 +133,18 @@ def to_dict(self) -> dict[str, Any]: """ Serialize the toolset to a dictionary. - Only the skills directory is serialized; tools are rebuilt by rescanning on deserialization. - :returns: Dictionary representation of the toolset. """ - return {"type": generate_qualified_class_name(type(self)), "data": {"skills_dir": str(self.skills_dir)}} + return {"type": generate_qualified_class_name(type(self)), "data": {"store": self._store.to_dict()}} @classmethod def from_dict(cls, data: dict[str, Any]) -> "SkillToolset": """ Deserialize a toolset from a dictionary. - :param data: Dictionary representation of the toolset. + :param data: Dictionary representation of the toolset, as produced by `to_dict`. :returns: A new SkillToolset instance. 
""" - return cls(skills_dir=data["data"]["skills_dir"]) + inner_data = data["data"] + deserialize_component_inplace(inner_data, key="store") + return cls(**inner_data) diff --git a/test/skill_stores/__init__.py b/test/skill_stores/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/test/skill_stores/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/skill_stores/file_system/__init__.py b/test/skill_stores/file_system/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/test/skill_stores/file_system/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/skill_stores/file_system/test_skill_store.py b/test/skill_stores/file_system/test_skill_store.py new file mode 100644 index 0000000000..3b6969625b --- /dev/null +++ b/test/skill_stores/file_system/test_skill_store.py @@ -0,0 +1,116 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 + +import pytest + +from haystack.skill_stores.file_system.skill_store import FileSystemSkillStore, _parse_frontmatter +from haystack.skill_stores.types.protocol import SkillStore + + +def _write_skill(skills_dir, name, description=None, body="Instructions.", files=None): + skill_dir = skills_dir / name + skill_dir.mkdir(parents=True) + frontmatter = f"---\nname: {name}\n" + if description is not None: + frontmatter += f"description: {description}\n" + frontmatter += "---\n" + (skill_dir / "SKILL.md").write_text(frontmatter + body, encoding="utf-8") + for rel_path, content in (files or {}).items(): + target = skill_dir / rel_path + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text(content, encoding="utf-8") + return skill_dir + + +class TestParseFrontmatter: + def test_parses_frontmatter_and_body(self): + frontmatter, body = 
_parse_frontmatter("---\nname: a\ndescription: d\n---\nThe body.") + assert frontmatter == {"name": "a", "description": "d"} + assert body == "The body." + + def test_no_frontmatter_returns_empty_mapping(self): + frontmatter, body = _parse_frontmatter("Just a body, no frontmatter.") + assert frontmatter == {} + assert body == "Just a body, no frontmatter." + + def test_non_mapping_frontmatter_raises(self): + with pytest.raises(ValueError): + _parse_frontmatter("---\n- just\n- a\n- list\n---\nbody") + + +class TestFileSystemSkillStore: + def test_list_skills(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="Fill PDF forms.") + _write_skill(tmp_path, "excel", description="Edit spreadsheets.") + + store = FileSystemSkillStore(tmp_path) + skills = store.list_skills() + + assert set(skills) == {"pdf-forms", "excel"} + assert skills["pdf-forms"].description == "Fill PDF forms." + assert skills["pdf-forms"].path == tmp_path / "pdf-forms" + + def test_missing_directory_raises(self, tmp_path): + with pytest.raises(ValueError, match="does not exist"): + FileSystemSkillStore(tmp_path / "nope") + + def test_missing_description_raises(self, tmp_path): + _write_skill(tmp_path, "broken", description=None) + with pytest.raises(ValueError, match="missing a 'description'"): + FileSystemSkillStore(tmp_path) + + def test_load_skill_body(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d", body="Step 1. Do the thing.") + store = FileSystemSkillStore(tmp_path) + assert store.load_skill_body("pdf-forms") == "Step 1. Do the thing." 
+ + def test_load_skill_body_unknown_raises(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d") + store = FileSystemSkillStore(tmp_path) + with pytest.raises(KeyError): + store.load_skill_body("nope") + + def test_list_skill_files(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d", files={"reference/forms.md": "details"}) + store = FileSystemSkillStore(tmp_path) + assert store.list_skill_files("pdf-forms") == ["reference/forms.md"] + + def test_list_skill_files_empty(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d") + store = FileSystemSkillStore(tmp_path) + assert store.list_skill_files("pdf-forms") == [] + + def test_list_skill_files_unknown_raises(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d") + store = FileSystemSkillStore(tmp_path) + with pytest.raises(KeyError): + store.list_skill_files("nope") + + def test_read_skill_file(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d", files={"reference/forms.md": "form details"}) + store = FileSystemSkillStore(tmp_path) + assert store.read_skill_file("pdf-forms", "reference/forms.md") == "form details" + + def test_read_skill_file_blocks_traversal(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d") + (tmp_path / "secret.txt").write_text("top secret") + store = FileSystemSkillStore(tmp_path) + with pytest.raises(PermissionError, match="escapes"): + store.read_skill_file("pdf-forms", "../secret.txt") + + def test_read_skill_file_missing_raises(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d") + store = FileSystemSkillStore(tmp_path) + with pytest.raises(FileNotFoundError, match="not found"): + store.read_skill_file("pdf-forms", "nope.md") + + def test_read_skill_file_unknown_skill_raises(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d") + store = FileSystemSkillStore(tmp_path) + with pytest.raises(KeyError): + store.read_skill_file("nope", "anything.md") 
+ + def test_is_skill_store(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="d") + assert isinstance(FileSystemSkillStore(tmp_path), SkillStore) diff --git a/test/tools/skills/__init__.py b/test/tools/skills/__init__.py new file mode 100644 index 0000000000..c1764a6e03 --- /dev/null +++ b/test/tools/skills/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2022-present deepset GmbH +# +# SPDX-License-Identifier: Apache-2.0 diff --git a/test/tools/skills/test_skill_toolset.py b/test/tools/skills/test_skill_toolset.py index 093e86ec04..ff331b233b 100644 --- a/test/tools/skills/test_skill_toolset.py +++ b/test/tools/skills/test_skill_toolset.py @@ -2,10 +2,39 @@ # # SPDX-License-Identifier: Apache-2.0 -import pytest +from haystack.core.serialization import generate_qualified_class_name +from haystack.dataclasses.skill_meta import SkillMeta +from haystack.skill_stores.file_system.skill_store import FileSystemSkillStore from haystack.tools import SkillToolset -from haystack.tools.skills.skill_toolset import _parse_frontmatter + + +class _SerializableStore: + """Module-level custom store used to test round-trip serialization.""" + + def __init__(self, skills: dict[str, str]) -> None: + self._data = skills + + def list_skills(self) -> dict[str, SkillMeta]: + return {name: SkillMeta(name=name, description=desc) for name, desc in self._data.items()} + + def load_skill_body(self, name: str) -> str: + if name not in self._data: + raise KeyError(name) + return f"Instructions for {name}." 
+ + def list_skill_files(self, name: str) -> list[str]: + return [] + + def read_skill_file(self, name: str, path: str) -> str: + raise FileNotFoundError + + def to_dict(self) -> dict: + return {"type": generate_qualified_class_name(type(self)), "init_parameters": {"skills": self._data}} + + @classmethod + def from_dict(cls, data: dict) -> "_SerializableStore": + return cls(skills=data["init_parameters"]["skills"]) def _write_skill(skills_dir, name, description=None, body="Instructions.", files=None): @@ -23,51 +52,34 @@ def _write_skill(skills_dir, name, description=None, body="Instructions.", files return skill_dir -class TestParseFrontmatter: - def test_parses_frontmatter_and_body(self): - frontmatter, body = _parse_frontmatter("---\nname: a\ndescription: d\n---\nThe body.") - assert frontmatter == {"name": "a", "description": "d"} - assert body == "The body." - - def test_no_frontmatter_returns_empty_mapping(self): - frontmatter, body = _parse_frontmatter("Just a body, no frontmatter.") - assert frontmatter == {} - assert body == "Just a body, no frontmatter." - - def test_non_mapping_frontmatter_raises(self): - with pytest.raises(ValueError): - _parse_frontmatter("---\n- just\n- a\n- list\n---\nbody") - - class TestSkillToolset: def test_scans_skills(self, tmp_path): _write_skill(tmp_path, "pdf-forms", description="Use to fill PDF forms.") _write_skill(tmp_path, "excel", description="Use to edit spreadsheets.") - toolset = SkillToolset(tmp_path) + toolset = SkillToolset(FileSystemSkillStore(tmp_path)) assert set(toolset.skills) == {"pdf-forms", "excel"} assert toolset.skills["pdf-forms"].description == "Use to fill PDF forms." 
assert {t.name for t in toolset} == {"load_skill", "read_skill_file"} - def test_missing_directory_raises(self, tmp_path): - with pytest.raises(ValueError, match="does not exist"): - SkillToolset(tmp_path / "nope") - - def test_missing_description_raises(self, tmp_path): - _write_skill(tmp_path, "broken", description=None) - with pytest.raises(ValueError, match="missing a 'description'"): - SkillToolset(tmp_path) + def test_accepts_skill_store_instance(self, tmp_path): + _write_skill(tmp_path, "pdf-forms", description="Use to fill PDF forms.") + store = FileSystemSkillStore(tmp_path) + toolset = SkillToolset(store) + assert set(toolset.skills) == {"pdf-forms"} + assert toolset._store is store def test_system_prompt_contribution_lists_skills(self, tmp_path): _write_skill(tmp_path, "pdf-forms", description="Use to fill PDF forms.") - contribution = SkillToolset(tmp_path).system_prompt_contribution() + contribution = SkillToolset(FileSystemSkillStore(tmp_path)).system_prompt_contribution() + assert contribution is not None assert "## Available Skills" in contribution assert "**pdf-forms**: Use to fill PDF forms." in contribution assert "load_skill" in contribution and "read_skill_file" in contribution def test_system_prompt_contribution_none_when_empty(self, tmp_path): - assert SkillToolset(tmp_path).system_prompt_contribution() is None + assert SkillToolset(FileSystemSkillStore(tmp_path)).system_prompt_contribution() is None def test_load_skill_returns_body_and_manifest(self, tmp_path): _write_skill( @@ -77,43 +89,86 @@ def test_load_skill_returns_body_and_manifest(self, tmp_path): body="Step 1. Do the thing.", files={"reference/forms.md": "details"}, ) - load_skill = next(t for t in SkillToolset(tmp_path) if t.name == "load_skill") + load_skill = next(t for t in SkillToolset(FileSystemSkillStore(tmp_path)) if t.name == "load_skill") result = load_skill.invoke(name="pdf-forms") assert "Step 1. Do the thing." 
in result assert "reference/forms.md" in result def test_load_skill_unknown(self, tmp_path): _write_skill(tmp_path, "pdf-forms", description="Use to fill PDF forms.") - load_skill = next(t for t in SkillToolset(tmp_path) if t.name == "load_skill") + load_skill = next(t for t in SkillToolset(FileSystemSkillStore(tmp_path)) if t.name == "load_skill") assert "Unknown skill 'nope'" in load_skill.invoke(name="nope") def test_read_skill_file(self, tmp_path): _write_skill(tmp_path, "pdf-forms", description="d", files={"reference/forms.md": "form details"}) - read = next(t for t in SkillToolset(tmp_path) if t.name == "read_skill_file") + read = next(t for t in SkillToolset(FileSystemSkillStore(tmp_path)) if t.name == "read_skill_file") assert read.invoke(name="pdf-forms", path="reference/forms.md") == "form details" def test_read_skill_file_blocks_traversal(self, tmp_path): _write_skill(tmp_path, "pdf-forms", description="d") (tmp_path / "secret.txt").write_text("top secret") - read = next(t for t in SkillToolset(tmp_path) if t.name == "read_skill_file") + read = next(t for t in SkillToolset(FileSystemSkillStore(tmp_path)) if t.name == "read_skill_file") result = read.invoke(name="pdf-forms", path="../secret.txt") assert "escapes" in result assert "top secret" not in result def test_read_skill_file_missing(self, tmp_path): _write_skill(tmp_path, "pdf-forms", description="d") - read = next(t for t in SkillToolset(tmp_path) if t.name == "read_skill_file") + read = next(t for t in SkillToolset(FileSystemSkillStore(tmp_path)) if t.name == "read_skill_file") assert "not found" in read.invoke(name="pdf-forms", path="nope.md") def test_to_dict_and_from_dict(self, tmp_path): _write_skill(tmp_path, "pdf-forms", description="Use to fill PDF forms.") - toolset = SkillToolset(tmp_path) + toolset = SkillToolset(FileSystemSkillStore(tmp_path)) data = toolset.to_dict() assert data == { "type": "haystack.tools.skills.skill_toolset.SkillToolset", - "data": {"skills_dir": str(tmp_path)}, + 
"data": { + "store": { + "type": "haystack.skill_stores.file_system.skill_store.FileSystemSkillStore", + "init_parameters": {"skills_dir": str(tmp_path)}, + } + }, } restored = SkillToolset.from_dict(data) assert set(restored.skills) == {"pdf-forms"} + + def test_to_dict_and_from_dict_with_custom_serializable_store(self): + store = _SerializableStore(skills={"demo": "A demo skill."}) + toolset = SkillToolset(store) + + serialized = toolset.to_dict() + assert serialized["data"]["store"]["init_parameters"]["skills"] == {"demo": "A demo skill."} + + restored = SkillToolset.from_dict(serialized) + assert set(restored.skills) == {"demo"} + + def test_load_skill_via_custom_store(self, tmp_path): + class _InMemoryStore: + def list_skills(self): + return {"demo": SkillMeta(name="demo", description="A demo skill.")} + + def load_skill_body(self, name): + if name != "demo": + raise KeyError(name) + return "Do the demo thing." + + def list_skill_files(self, name): + return [] + + def read_skill_file(self, name, path): + raise FileNotFoundError + + def to_dict(self): + raise NotImplementedError + + @classmethod + def from_dict(cls, data): + raise NotImplementedError + + toolset = SkillToolset(_InMemoryStore()) + load_skill = next(t for t in toolset if t.name == "load_skill") + assert load_skill.invoke(name="demo") == "Do the demo thing." + assert "Unknown skill 'nope'" in load_skill.invoke(name="nope")