Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 77 additions & 1 deletion astrbot/core/tools/computer_tools/fs.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@
from dataclasses import dataclass, field
from pathlib import Path

import mcp.types

from astrbot.api import FunctionTool, logger
from astrbot.api.event import MessageChain
from astrbot.core.agent.run_context import ContextWrapper
Expand Down Expand Up @@ -215,6 +217,66 @@ def _decode_escaped_text(value: str) -> str:
)


def _provider_supports_image(context: ContextWrapper[AstrAgentContext]) -> bool:
    """Check whether the provider currently in use accepts image input.

    The provider is resolved with ``get_using_provider`` for the event's
    unified message origin, and the ``"modalities"`` entry of its
    ``provider_config`` is inspected for ``"image"``.

    Args:
        context: Run context of the tool call; the event and the provider
            lookup are reached through ``context.context``.

    Returns:
        ``False`` only when the provider explicitly declares ``modalities``
        and ``"image"`` is not among them. ``True`` in every other case,
        including all situations where the capability cannot be determined
        (no provider, no ``modalities`` declaration, or any lookup error).
    """
    try:
        umo = context.context.event.unified_msg_origin
        provider = context.context.context.get_using_provider(umo=umo)
        if provider is None:
            return True  # Cannot determine, assume supported

        modalities = provider.provider_config.get("modalities")
        if modalities is None:
            # A provider without a modalities declaration is "unknown", not
            # "text only". Treat it like the other undeterminable cases
            # instead of silently disabling image results.
            return True
        return "image" in modalities
    except Exception:
        # Deliberate best-effort: a broken lookup must never block the tool
        # result, so fall back to the permissive answer.
        return True  # Cannot determine, assume supported


async def _caption_image_fallback(
    context: ContextWrapper[AstrAgentContext],
    image_path: str,
) -> ToolExecResult:
    """Describe an image in text using the configured image caption provider.

    The caption provider id and the caption prompt are read from the
    ``provider_settings`` section of the configuration for the event's
    unified message origin.

    Args:
        context: Run context of the tool call; configuration and provider
            lookups are reached through ``context.context``.
        image_path: Location of the image. It is forwarded unchanged as the
            only entry of ``image_urls`` to the caption provider.
            NOTE(review): nothing here checks that the provider can actually
            resolve this value (e.g. a path that only exists inside a
            sandbox) - confirm with the caller.

    Returns:
        ``"[Image description]: <caption>"`` on success, otherwise a string
        starting with ``"Error:"`` that explains why no caption is available
        (no provider configured, provider unavailable, empty caption, or the
        captioning call raised).
    """
    # Imported locally, as in the original change.
    from astrbot.core.provider.provider import Provider

    umo = context.context.event.unified_msg_origin
    cfg = context.context.context.get_config(umo=umo)
    provider_settings = cfg.get("provider_settings", {})
    caption_provider_id = provider_settings.get("default_image_caption_provider_id", "")

    if not caption_provider_id:
        return (
            "Error: your provider does not support image modality, "
            "and no image caption provider is configured. Unable to read image file."
        )

    caption_provider = context.context.context.get_provider_by_id(caption_provider_id)
    # isinstance() is False for None, so this also covers an unknown id.
    if not isinstance(caption_provider, Provider):
        return (
            "Error: your provider does not support image modality, "
            f"and the configured image caption provider `{caption_provider_id}` is not available. "
            "Unable to read image file."
        )

    caption_prompt = provider_settings.get(
        "image_caption_prompt", "Please describe the image."
    )

    try:
        llm_resp = await caption_provider.text_chat(
            prompt=caption_prompt,
            image_urls=[image_path],
        )
        caption = (llm_resp.completion_text or "").strip()
        if not caption:
            return "Error: image caption provider returned an empty description."
        return f"[Image description]: {caption}"
    except Exception as exc:
        # logger.exception keeps the traceback, which the plain error string
        # returned to the model does not carry.
        logger.exception("Image captioning failed: %s", exc)
        return f"Error: failed to generate image description: {exc}"
Comment on lines +233 to +277
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

In sandbox mode (`local_env = False`), the image file lives inside the sandbox container/environment, while the provider's `text_chat` is executed by the host process running AstrBot. Passing `image_path` (a sandbox path) directly to the provider will therefore fail, because the host cannot read the sandbox filesystem.

Since `read_file_tool_result` already reads the image and compresses it into base64 (returned as `mcp.types.ImageContent`), we should pass a base64 data URI (e.g., `data:{mimeType};base64,{data}`) to `_caption_image_fallback` instead of the file path. This avoids re-reading the file and works in both local and sandbox environments.

async def _caption_image_fallback(
    context: ContextWrapper[AstrAgentContext],
    image_url: str,
) -> ToolExecResult:
    """Caption an image through the configured image caption provider.

    Both the caption provider id (``default_image_caption_provider_id``) and
    the prompt (``image_caption_prompt``) come from the ``provider_settings``
    section of the configuration resolved for the event's unified message
    origin.

    Args:
        context: Run context of the tool call; configuration and provider
            lookups are reached through ``context.context``.
        image_url: Image reference handed to the caption provider as the
            single element of ``image_urls``. It is passed through as-is.

    Returns:
        ``"[Image description]: <caption>"`` when a non-empty caption was
        produced, otherwise an ``"Error: ..."`` string describing why the
        image could not be captioned.
    """
    from astrbot.core.provider.provider import Provider

    umo = context.context.event.unified_msg_origin
    cfg = context.context.context.get_config(umo=umo)
    provider_settings = cfg.get("provider_settings", {})
    caption_provider_id = provider_settings.get("default_image_caption_provider_id", "")

    if not caption_provider_id:
        return (
            "Error: your provider does not support image modality, "
            "and no image caption provider is configured. Unable to read image file."
        )

    caption_provider = context.context.context.get_provider_by_id(caption_provider_id)
    # A missing provider (None) fails the isinstance check as well.
    if not isinstance(caption_provider, Provider):
        return (
            "Error: your provider does not support image modality, "
            f"and the configured image caption provider `{caption_provider_id}` is not available. "
            "Unable to read image file."
        )

    caption_prompt = provider_settings.get(
        "image_caption_prompt", "Please describe the image."
    )

    try:
        llm_resp = await caption_provider.text_chat(
            prompt=caption_prompt,
            image_urls=[image_url],
        )
        caption = (llm_resp.completion_text or "").strip()
        if not caption:
            return "Error: image caption provider returned an empty description."
        return f"[Image description]: {caption}"
    except Exception as exc:
        # Record the traceback in the log; the caller only receives the
        # short error string below.
        logger.exception("Image captioning failed: %s", exc)
        return f"Error: failed to generate image description: {exc}"



@builtin_tool(config=_COMPUTER_RUNTIME_TOOL_CONFIG)
@dataclass
class FileReadTool(FunctionTool):
Expand Down Expand Up @@ -281,7 +343,7 @@ async def call(
context.context.context,
context.context.event.unified_msg_origin,
)
return await read_file_tool_result(
result = await read_file_tool_result(
sb,
local_mode=local_env,
path=normalized_path,
Expand All @@ -293,6 +355,20 @@ async def call(
else None
),
)

# If the result is an image and the provider doesn't support image modality,
# fall back to image captioning or return an error.
if (
isinstance(result, mcp.types.CallToolResult)
and result.content
and any(
isinstance(item, mcp.types.ImageContent) for item in result.content
)
and not _provider_supports_image(context)
):
return await _caption_image_fallback(context, normalized_path)

return result
Comment on lines +361 to +371
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

high

Update the fallback call to extract the base64 image data from the CallToolResult and pass it as a data URI to _caption_image_fallback. This ensures compatibility with sandbox environments where the host cannot directly access the local file path. Additionally, please ensure this new attachment handling functionality is accompanied by corresponding unit tests.

Suggested change
if (
isinstance(result, mcp.types.CallToolResult)
and result.content
and any(
isinstance(item, mcp.types.ImageContent) for item in result.content
)
and not _provider_supports_image(context)
):
return await _caption_image_fallback(context, normalized_path)
return result
if (
isinstance(result, mcp.types.CallToolResult)
and result.content
and any(
isinstance(item, mcp.types.ImageContent) for item in result.content
)
and not _provider_supports_image(context)
):
image_item = next(
item for item in result.content if isinstance(item, mcp.types.ImageContent)
)
image_url = f"data:{image_item.mimeType};base64,{image_item.data}"
return await _caption_image_fallback(context, image_url)
return result
References
  1. New functionality, such as handling attachments, should be accompanied by corresponding unit tests.

except PermissionError as exc:
return f"Error: {exc}"
except Exception as exc:
Expand Down
Loading