Skip to content
2 changes: 2 additions & 0 deletions haystack/dataclasses/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"image_content": ["ImageContent"],
"file_content": ["FileContent"],
"document": ["Document"],
"skill_meta": ["SkillMeta"],
"sparse_embedding": ["SparseEmbedding"],
"state": ["State"],
"streaming_chunk": [
Expand Down Expand Up @@ -46,6 +47,7 @@
from .document import Document as Document
from .file_content import FileContent as FileContent
from .image_content import ImageContent as ImageContent
from .skill_meta import SkillMeta as SkillMeta
from .sparse_embedding import SparseEmbedding as SparseEmbedding
from .streaming_chunk import AsyncStreamingCallbackT as AsyncStreamingCallbackT
from .streaming_chunk import ComponentInfo as ComponentInfo
Expand Down
21 changes: 21 additions & 0 deletions haystack/dataclasses/skill_meta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from dataclasses import dataclass, field
from pathlib import Path


@dataclass
class SkillMeta:
    """
    Metadata describing a single skill.

    Instances carry only the lightweight information needed to advertise a skill; the skill's
    instructions themselves are not stored here.

    :param name: The skill's name, used by the agent to load it.
    :param description: A short description of when to use the skill. Shown to the agent up front.
    :param path: The skill's directory. Set by `FileSystemSkillStore`; can be `None` for other stores.
    """

    # Identifier of the skill.
    name: str
    # Human-readable summary of when the skill applies.
    description: str
    # Directory holding the skill on disk; defaults to `None` when no directory is associated.
    path: Path | None = field(default=None)
3 changes: 3 additions & 0 deletions haystack/skill_stores/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
15 changes: 15 additions & 0 deletions haystack/skill_stores/file_system/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
Comment thread
tstadel marked this conversation as resolved.

import sys
from typing import TYPE_CHECKING

from lazy_imports import LazyImporter

# Maps each submodule of this package to the public names it provides; handed to LazyImporter below.
_import_structure = {"skill_store": ["FileSystemSkillStore"]}

if TYPE_CHECKING:
    # Static type checkers get a regular import so `FileSystemSkillStore` resolves during analysis.
    from .skill_store import FileSystemSkillStore as FileSystemSkillStore
else:
    # At runtime, replace this module's entry in `sys.modules` with a LazyImporter built from
    # `_import_structure` (presumably so `skill_store` is only imported on first attribute access).
    sys.modules[__name__] = LazyImporter(name=__name__, module_file=__file__, import_structure=_import_structure)
136 changes: 136 additions & 0 deletions haystack/skill_stores/file_system/skill_store.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from pathlib import Path
from typing import Any

import yaml

from haystack.core.serialization import default_from_dict, default_to_dict
from haystack.dataclasses.skill_meta import SkillMeta
from haystack.skill_stores.types.protocol import SKILL_FILE_NAME


def _parse_frontmatter(text: str) -> tuple[dict[str, Any], str]:
"""
Split a `SKILL.md` file into its YAML frontmatter and markdown body.

The frontmatter is the YAML block delimited by leading and trailing `---` lines. If no frontmatter is
present, an empty mapping and the original text are returned.

:param text: The full contents of a `SKILL.md` file.
:returns: A tuple of (frontmatter mapping, body).
:raises ValueError: If the frontmatter is present but is not a valid YAML mapping.
"""
stripped = text.lstrip()
if not stripped.startswith("---"):
return {}, text

# Drop the leading '---' line, then split on the closing '---'.
after_open = stripped[len("---") :].lstrip("\n")
parts = after_open.split("\n---", 1)
if len(parts) != 2:
return {}, text

frontmatter_block, body = parts
loaded = yaml.safe_load(frontmatter_block) or {}
if not isinstance(loaded, dict):
raise ValueError("Skill frontmatter must be a YAML mapping.") # noqa: TRY004
return loaded, body.lstrip("\n")


class FileSystemSkillStore:
    """
    SkillStore backed by a directory of skill sub-directories on the local filesystem.

    Expected layout:

    ```
    skills/
      pdf-forms/
        SKILL.md              # frontmatter (name, description) + markdown instructions
        reference/forms.md    # optional bundled file
    ```

    Only the frontmatter of each `SKILL.md` is used at construction time; bodies and bundled files are
    read again from disk whenever the corresponding method is called.

    A skill's name is taken from the `name` frontmatter field and falls back to the name of the skill's
    directory when that field is absent or empty. Names are always stored as strings.

    :param skills_dir: Root directory that contains one sub-directory per skill.
    :raises ValueError: If `skills_dir` does not exist, is not a directory, a skill has invalid
        frontmatter or is missing a required frontmatter field, or two skills share the same name.
    """

    def __init__(self, skills_dir: str | Path) -> None:
        self.skills_dir = Path(skills_dir)
        self._skills = self._scan()

    def _scan(self) -> dict[str, SkillMeta]:
        """
        Build the skill catalog from every `<skills_dir>/<sub-directory>/SKILL.md` file.

        :returns: Mapping of skill name to its metadata, filled in sorted path order.
        :raises ValueError: If the root is not a directory, a frontmatter block is invalid, a skill has
            no description, or a skill name occurs twice.
        """
        if not self.skills_dir.is_dir():
            raise ValueError(f"Skills directory '{self.skills_dir}' does not exist or is not a directory.")

        skills: dict[str, SkillMeta] = {}
        for skill_file in sorted(self.skills_dir.glob(f"*/{SKILL_FILE_NAME}")):
            skill_dir = skill_file.parent
            text = skill_file.read_text(encoding="utf-8")
            try:
                frontmatter, _ = _parse_frontmatter(text)
            except ValueError as err:
                # Re-raise with the offending file so a broken skill can be located.
                raise ValueError(f"Invalid frontmatter in '{skill_file}': {err}") from err

            # A `name:` key with a null/empty value must fall back to the directory name as well, and
            # YAML scalars such as `2024` must become strings so that string lookups can find the skill.
            declared_name = frontmatter.get("name")
            name = skill_dir.name if declared_name in (None, "") else str(declared_name)

            description = frontmatter.get("description")
            if not description:
                raise ValueError(f"Skill '{name}' ({skill_file}) is missing a 'description' in its frontmatter.")
            if name in skills:
                raise ValueError(f"Duplicate skill name '{name}' found in '{self.skills_dir}'.")

            skills[name] = SkillMeta(name=name, description=description, path=skill_dir)
        return skills

    def _skill_dir(self, name: str) -> Path:
        """
        Return the directory of the named skill.

        :param name: Skill name as returned by `list_skills`.
        :raises KeyError: If no skill with `name` exists.
        :raises ValueError: If the skill's metadata carries no directory path.
        """
        meta = self._skills.get(name)
        if meta is None:
            raise KeyError(name)
        if meta.path is None:
            raise ValueError(f"Skill '{name}' is missing its directory path in metadata.")
        return meta.path

    def list_skills(self) -> dict[str, SkillMeta]:
        """
        List all skills found on disk when the store was constructed.

        :returns: Mapping of skill name to its metadata. A new dictionary is returned on every call, so
            changing it does not alter the store's catalog.
        """
        return dict(self._skills)

    def load_skill_body(self, name: str) -> str:
        """
        Read the named skill's `SKILL.md` from disk and return its markdown body without frontmatter.

        :param name: Skill name as returned by `list_skills`.
        :raises KeyError: If no skill with `name` exists.
        :raises ValueError: If the skill's metadata carries no directory path.
        """
        skill_file = self._skill_dir(name) / SKILL_FILE_NAME
        _, body = _parse_frontmatter(skill_file.read_text(encoding="utf-8"))
        return body

    def list_skill_files(self, name: str) -> list[str]:
        """
        List all files in a skill directory, excluding the skill's own top-level `SKILL.md` file.

        :param name: Skill name as returned by `list_skills`.
        :returns: Sorted POSIX-style paths relative to the skill directory, including files in nested
            directories.
        :raises KeyError: If no skill with `name` exists.
        :raises ValueError: If the skill's metadata carries no directory path.
        """
        skill_dir = self._skill_dir(name)
        # Compare full paths: only the root definition file is excluded, not nested files that happen
        # to share its name.
        definition_file = skill_dir / SKILL_FILE_NAME
        return sorted(
            candidate.relative_to(skill_dir).as_posix()
            for candidate in skill_dir.rglob("*")
            if candidate.is_file() and candidate != definition_file
        )

    def read_skill_file(self, name: str, path: str) -> str:
        """
        Read a file from the named skill's directory, refusing paths that resolve outside of it.

        :param name: Skill name as returned by `list_skills`.
        :param path: Path of the file relative to the skill directory.
        :returns: The file's text content, decoded as UTF-8.
        :raises KeyError: If no skill with `name` exists.
        :raises ValueError: If the skill's metadata carries no directory path.
        :raises PermissionError: If `path` resolves to a location outside the skill directory.
        :raises FileNotFoundError: If the resolved path is not an existing file.
        """
        skill_dir = self._skill_dir(name).resolve()
        # Resolve before checking so that `..` segments and symlinks cannot step out of the directory.
        target = (skill_dir / path).resolve()
        if not target.is_relative_to(skill_dir):
            raise PermissionError(f"path escapes the '{name}' skill directory")
        if not target.is_file():
            raise FileNotFoundError(f"File '{path}' not found in skill '{name}'")
        return target.read_text(encoding="utf-8")

    def to_dict(self) -> dict[str, Any]:
        """Serialize this store to a dictionary for use with :meth:`from_dict`."""
        return default_to_dict(self, skills_dir=str(self.skills_dir))

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "FileSystemSkillStore":
        """Deserialize a FileSystemSkillStore from its dictionary representation."""
        return default_from_dict(cls, data)
7 changes: 7 additions & 0 deletions haystack/skill_stores/types/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0
Comment thread
tstadel marked this conversation as resolved.

from .protocol import SkillStore

__all__ = ["SkillStore"]
88 changes: 88 additions & 0 deletions haystack/skill_stores/types/protocol.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you make a version of this SkillStore using a db (like the one to be used in platform) to make sure this protocol definition works for at least two different backends?

Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <info@deepset.ai>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Protocol, runtime_checkable

from haystack.dataclasses.skill_meta import SkillMeta

# File name of the skill definition file that `FileSystemSkillStore` looks for inside each skill directory.
SKILL_FILE_NAME = "SKILL.md"


@runtime_checkable
class SkillStore(Protocol):
    """
    Structural interface for a backend that stores skills.

    A store has two jobs: enumerate the skills it knows about, and hand out their content when asked.
    Any object providing the methods below can back `haystack.tools.SkillToolset`, whether the skills
    live in a local directory, a database, behind a remote API, or in an in-memory fixture.

    `load_skill_body`, `list_skill_files` and `read_skill_file` are invoked lazily at agent runtime
    rather than at construction time, which lets an implementation postpone I/O until a skill is
    actually requested.
    """

    def list_skills(self) -> dict[str, SkillMeta]:
        """
        Enumerate every skill the store can provide.

        `haystack.tools.SkillToolset` calls this once while initializing, to build its skills catalog.

        :returns: Metadata of each available skill, keyed by skill name.
        """
        ...

    def load_skill_body(self, name: str) -> str:
        """
        Fetch the instructions of one skill as markdown.

        :param name: Skill name as returned by `list_skills`.
        :returns: The raw markdown body, with the frontmatter removed.
        :raises KeyError: If no skill with `name` exists.
        """
        ...

    def list_skill_files(self, name: str) -> list[str]:
        """
        Enumerate the extra files that ship with one skill.

        :param name: Skill name as returned by `list_skills`.
        :returns: POSIX-style paths relative to the skill root, sorted. An empty list means the skill
            bundles no extra files.
        :raises KeyError: If no skill with `name` exists.
        """
        ...

    def read_skill_file(self, name: str, path: str) -> str:
        """
        Fetch the text of one file that ships with a skill.

        :param name: Skill name as returned by `list_skills`.
        :param path: Location of the file relative to the skill root (e.g. `"reference/forms.md"`).
        :returns: The text content of the file.
        :raises KeyError: If no skill with `name` exists.
        :raises PermissionError: If `path` points outside the skill's root (path-traversal attempt).
        :raises FileNotFoundError: If the skill contains no such file.
        """
        ...

    def to_dict(self) -> dict[str, Any]:
        """
        Turn this store into a dictionary that `from_dict` can restore.

        A custom store becomes serializable with `haystack.tools.SkillToolset` by overriding this
        method together with `from_dict`.
        """
        ...

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SkillStore":
        """
        Rebuild a store from a dictionary created by `to_dict`.

        A custom store becomes serializable with `haystack.tools.SkillToolset` by overriding this
        method together with `to_dict`.

        :param data: Dictionary as produced by `to_dict`.
        """
        ...
3 changes: 1 addition & 2 deletions haystack/tools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from haystack.tools.tool import Tool, _check_duplicate_tool_names
from haystack.tools.toolset import Toolset
from haystack.tools.searchable_toolset import SearchableToolset
from haystack.tools.skills import SkillMeta, SkillToolset
from haystack.tools.skills import SkillToolset
from haystack.tools.component_tool import ComponentTool
from haystack.tools.pipeline_tool import PipelineTool
from haystack.tools.serde_utils import deserialize_tools_or_toolset_inplace, serialize_tools_or_toolset
Expand All @@ -33,7 +33,6 @@
"serialize_tools_or_toolset",
"Tool",
"SearchableToolset",
"SkillMeta",
"SkillToolset",
"ToolsType",
"Toolset",
Expand Down
4 changes: 2 additions & 2 deletions haystack/tools/skills/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,6 @@
#
# SPDX-License-Identifier: Apache-2.0

from haystack.tools.skills.skill_toolset import SkillMeta, SkillToolset
from haystack.tools.skills.skill_toolset import SkillToolset

__all__ = ["SkillMeta", "SkillToolset"]
__all__ = ["SkillToolset"]
Loading
Loading