AstrBotDevs · kawayiYokami · May 29, 2026 · gemini-code-assist · May 29, 2026 · gemini-code-assist
diff --git a/astrbot/core/tools/computer_tools/fs.py b/astrbot/core/tools/computer_tools/fs.py
@@ -38,6 +38,8 @@
 from dataclasses import dataclass, field
 from pathlib import Path
 
+import mcp.types
+
 from astrbot.api import FunctionTool, logger
 from astrbot.api.event import MessageChain
 from astrbot.core.agent.run_context import ContextWrapper
@@ -215,6 +217,66 @@ def _decode_escaped_text(value: str) -> str:
     )
 
 
+def _provider_supports_image(context: ContextWrapper[AstrAgentContext]) -> bool:
+    """Check if the current provider supports image modality."""
+    try:
+        umo = context.context.event.unified_msg_origin
+        provider = context.context.context.get_using_provider(umo=umo)
+        if provider is None:
+            return True  # Cannot determine, assume supported
+        modalities = provider.provider_config.get("modalities", [])
+        return "image" in modalities
+    except Exception:
+        return True  # Cannot determine, assume supported
+
+
+async def _caption_image_fallback(
+    context: ContextWrapper[AstrAgentContext],
+    image_path: str,
+) -> ToolExecResult:
+    """Try to caption an image using the configured image caption provider.
+
+    Returns the caption text or an error message if no caption provider is available.
+    """
+    from astrbot.core.provider.provider import Provider
+
+    umo = context.context.event.unified_msg_origin
+    cfg = context.context.context.get_config(umo=umo)
+    provider_settings = cfg.get("provider_settings", {})
+    caption_provider_id = provider_settings.get("default_image_caption_provider_id", "")
+
+    if not caption_provider_id:
+        return (
+            "Error: your provider does not support image modality, "
+            "and no image caption provider is configured. Unable to read image file."
+        )
+
+    caption_provider = context.context.context.get_provider_by_id(caption_provider_id)
+    if caption_provider is None or not isinstance(caption_provider, Provider):
+        return (
+            "Error: your provider does not support image modality, "
+            f"and the configured image caption provider `{caption_provider_id}` is not available. "
+            "Unable to read image file."
+        )
+
+    caption_prompt = provider_settings.get(
+        "image_caption_prompt", "Please describe the image."
+    )
+
+    try:
+        llm_resp = await caption_provider.text_chat(
+            prompt=caption_prompt,
+            image_urls=[image_path],
+        )
+        caption = (llm_resp.completion_text or "").strip()
+        if not caption:
+            return "Error: image caption provider returned an empty description."
+        return f"[Image description]: {caption}"
+    except Exception as exc:
+        logger.error(f"Image captioning failed: {exc}")
+        return f"Error: failed to generate image description: {exc}"
+
+
 @builtin_tool(config=_COMPUTER_RUNTIME_TOOL_CONFIG)
 @dataclass
 class FileReadTool(FunctionTool):
@@ -281,7 +343,7 @@ async def call(
                 context.context.context,
                 context.context.event.unified_msg_origin,
             )
-            return await read_file_tool_result(
+            result = await read_file_tool_result(
                 sb,
                 local_mode=local_env,
                 path=normalized_path,
@@ -293,6 +355,20 @@ async def call(
                     else None
                 ),
             )
+
+            # If the result is an image and the provider doesn't support image modality,
+            # fall back to image captioning or return an error.
+            if (
+                isinstance(result, mcp.types.CallToolResult)
+                and result.content
+                and any(
+                    isinstance(item, mcp.types.ImageContent) for item in result.content
+                )
+                and not _provider_supports_image(context)
+            ):
+                return await _caption_image_fallback(context, normalized_path)
+
+            return result
-            if (
-                isinstance(result, mcp.types.CallToolResult)
-                and result.content
-                and any(
-                    isinstance(item, mcp.types.ImageContent) for item in result.content
-                )
-                and not _provider_supports_image(context)
-            ):
-                return await _caption_image_fallback(context, normalized_path)
-
-            return result
+            if (
+                isinstance(result, mcp.types.CallToolResult)
+                and result.content
+                and any(
+                    isinstance(item, mcp.types.ImageContent) for item in result.content
+                )
+                and not _provider_supports_image(context)
+            ):
+                image_item = next(
+                    item for item in result.content if isinstance(item, mcp.types.ImageContent)
+                )
+                image_url = f"data:{image_item.mimeType};base64,{image_item.data}"
+                return await _caption_image_fallback(context, image_url)
+
+            return result
-            if (
-                isinstance(result, mcp.types.CallToolResult)
-                and result.content
-                and any(
-                    isinstance(item, mcp.types.ImageContent) for item in result.content
-                )
-                and not _provider_supports_image(context)
-            ):
-                return await _caption_image_fallback(context, normalized_path)
-
-            return result
+            if (
+                isinstance(result, mcp.types.CallToolResult)
+                and result.content
+                and any(
+                    isinstance(item, mcp.types.ImageContent) for item in result.content
+                )
+                and not _provider_supports_image(context)
+            ):
+                image_item = next(
+                    item for item in result.content if isinstance(item, mcp.types.ImageContent)
+                )
+                image_url = f"data:{image_item.mimeType};base64,{image_item.data}"
+                return await _caption_image_fallback(context, image_url)
+
+            return result
         except PermissionError as exc:
             return f"Error: {exc}"
         except Exception as exc: