diff --git a/benchmarks/harbor/__init__.py b/benchmarks/harbor/__init__.py
new file mode 100644
index 00000000..d61e4690
--- /dev/null
+++ b/benchmarks/harbor/__init__.py
@@ -0,0 +1 @@
+"""Generic Harbor-backed benchmark runner."""
diff --git a/benchmarks/harbor/eval_infer.py b/benchmarks/harbor/eval_infer.py
new file mode 100644
index 00000000..fe4c3347
--- /dev/null
+++ b/benchmarks/harbor/eval_infer.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+"""Generic Harbor evaluation report generator."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+from typing import Any
+
+from benchmarks.utils.laminar import LaminarService
+from benchmarks.utils.report_costs import generate_cost_report
+from openhands.sdk import get_logger
+
+
+logger = get_logger(__name__)
+
+
+def _metric(
+    data: dict[str, Any], test_result: dict[str, Any], key: str, default: Any
+) -> Any:
+    """Look up ``key`` in ``metrics``, then ``final_metrics``; null maps act empty."""
+    value = (data.get("metrics") or {}).get(key)
+    if value is not None:
+        return value
+    return (test_result.get("final_metrics") or {}).get(key, default)
+
+
+def process_harbor_results(input_file: str, output_file: str) -> dict[str, Any]:
+    """Summarise a Harbor ``output.jsonl`` into a JSON report and return it.
+
+    Rows with ``error``, plus ``<stem>_errors.jsonl`` ids that never completed,
+    count as incomplete. Non-object rows and ``null`` test results are tolerated.
+    """
+    completed_ids: set[str] = set()
+    resolved_ids: set[str] = set()
+    unresolved_ids: set[str] = set()
+    incomplete_ids: set[str] = set()
+    error_ids: set[str] = set()
+    total_cost_usd = 0.0
+    total_prompt_tokens = 0
+    total_completion_tokens = 0
+
+    with open(input_file, encoding="utf-8") as infile:
+        for lineno, line in enumerate(infile, 1):
+            line = line.strip()
+            if not line:
+                continue
+            try:
+                data = json.loads(line)
+            except json.JSONDecodeError as exc:
+                logger.error("Line %d: invalid JSON: %s", lineno, exc)
+                continue
+            if not isinstance(data, dict):
+                logger.error("Line %d: expected a JSON object", lineno)
+                continue
+
+            instance_id = data.get("instance_id")
+            if not instance_id:
+                logger.warning("Line %d: missing instance_id", lineno)
+                continue
+            if instance_id in completed_ids or instance_id in incomplete_ids:
+                logger.warning("Line %d: duplicate instance_id %s", lineno, instance_id)
+                continue
+            if data.get("error"):
+                error_ids.add(instance_id)
+                incomplete_ids.add(instance_id)
+                continue
+            # ``"test_result": null`` is treated like a missing result.
+            test_result = data.get("test_result") or {}
+            completed_ids.add(instance_id)
+            passed = test_result.get("passed") is True
+            (resolved_ids if passed else unresolved_ids).add(instance_id)
+            total_cost_usd += float(
+                _metric(data, test_result, "total_cost_usd", 0.0) or 0.0
+            )
+            total_prompt_tokens += int(
+                _metric(data, test_result, "total_prompt_tokens", 0) or 0
+            )
+            total_completion_tokens += int(
+                _metric(data, test_result, "total_completion_tokens", 0) or 0
+            )
+
+    error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl")
+    if error_path.exists():
+        with open(error_path, encoding="utf-8") as error_file:
+            for line in error_file:
+                try:
+                    row = json.loads(line)
+                except json.JSONDecodeError:
+                    continue
+                instance_id = row.get("instance_id") if isinstance(row, dict) else None
+                if instance_id and instance_id not in completed_ids:
+                    incomplete_ids.add(instance_id)
+                    error_ids.add(instance_id)
+
+    submitted_ids = completed_ids | incomplete_ids
+    report: dict[str, Any] = {
+        "total_instances": len(submitted_ids),
+        "submitted_instances": len(submitted_ids),
+        "completed_instances": len(completed_ids),
+        "incomplete_instances": len(incomplete_ids),
+        "resolved_instances": len(resolved_ids),
+        "unresolved_instances": len(unresolved_ids),
+        "error_instances": len(error_ids),
+        "submitted_ids": sorted(submitted_ids),
+        "completed_ids": sorted(completed_ids),
+        "incomplete_ids": sorted(incomplete_ids),
+        "resolved_ids": sorted(resolved_ids),
+        "unresolved_ids": sorted(unresolved_ids),
+        "error_ids": sorted(error_ids),
+        "aggregate_metrics": {
+            "total_cost_usd": total_cost_usd,
+            "total_prompt_tokens": total_prompt_tokens,
+            "total_completion_tokens": total_completion_tokens,
+        },
+    }
+
+    with open(output_file, "w", encoding="utf-8") as outfile:
+        json.dump(report, outfile, indent=4)
+    logger.info("Harbor report generated at %s", output_file)
+    logger.info(
+        "Resolved %d/%d completed instances", len(resolved_ids), len(completed_ids)
+    )
+    return report
+
+
+def main() -> None:
+    """CLI entry point: build the report JSON, then the cost report and telemetry."""
+    cli = argparse.ArgumentParser(description="Process generic Harbor output.jsonl")
+    cli.add_argument("input_file", help="Path to Harbor-converted output.jsonl")
+    cli.add_argument("--output-file", help="Output report JSON path")
+    args = cli.parse_args()
+
+    source = Path(args.input_file)
+    if not source.exists():
+        logger.error("Input file does not exist: %s", source)
+        sys.exit(1)
+
+    # Default report path swaps the input suffix for ``.report.json``.
+    if args.output_file:
+        report_path = Path(args.output_file)
+    else:
+        report_path = source.with_suffix(".report.json")
+    try:
+        process_harbor_results(str(source), str(report_path))
+        generate_cost_report(str(source))
+    except Exception as exc:
+        logger.error("Harbor evaluation failed: %s", exc)
+        sys.exit(1)
+    try:
+        LaminarService.get().update_evaluation_scores(str(source), str(report_path))
+    except Exception as exc:
+        logger.warning("Laminar telemetry reporting failed (non-fatal): %s", exc)
+
+    print(json.dumps({"report_json": str(report_path)}))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/harbor/run_infer.py b/benchmarks/harbor/run_infer.py
new file mode 100644
index 00000000..bef91c42
--- /dev/null
+++ b/benchmarks/harbor/run_infer.py
@@ -0,0 +1,353 @@
+#!/usr/bin/env python3
+"""Run any Harbor dataset/config/path with the OpenHands SDK agent."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shlex
+import shutil
+import subprocess
+import sys
+import tempfile
+from datetime import datetime, timezone
+from pathlib import Path
+
+from benchmarks.utils.evaluation_utils import construct_eval_output_dir
+from benchmarks.utils.harbor import (
+    _secret_value,
+    check_harbor_installed,
+    convert_harbor_to_eval_output,
+)
+from benchmarks.utils.report_costs import generate_cost_report
+from openhands.sdk import LLM, get_logger
+
+
+logger = get_logger(__name__)
+OUTPUT_FILENAME = "output.jsonl"
+DEFAULT_ADAPTER_REPO = "https://github.com/harbor-framework/harbor.git"
+
+
+def _load_task_ids(filepath: str) -> list[str]:
+    """Read task IDs from a text file, one per line.
+
+    Surrounding whitespace is stripped; blank lines and ``#`` comments are skipped.
+    """
+    with open(filepath, encoding="utf-8") as handle:
+        stripped = (raw.strip() for raw in handle)
+        return [tid for tid in stripped if tid and not tid.startswith("#")]
+
+
+def _checkout_adapter(repo: str, ref: str | None) -> tuple[Path, str]:
+    """Clone the adapter repo and return (checkout_dir, resolved_commit_sha).
+
+    The resolved SHA is captured so metadata can record exactly which commit
+    was evaluated, making runs reproducible even when ``ref`` is unset.
+
+    Raises:
+        RuntimeError: If the clone/checkout fails; the temporary checkout
+            directory is removed first so failed attempts do not leak it.
+    """
+
+    # Every git call captures text output so stderr can be reported on failure.
+    def git(*argv: str, cwd: Path | None = None) -> subprocess.CompletedProcess[str]:
+        return subprocess.run(["git", *argv], cwd=cwd, capture_output=True, text=True)
+
+    if not ref:
+        logger.warning(
+            "Cloning adapter repo without a pinned ref; results may not be "
+            "reproducible. Pass --harbor-adapter-ref to pin a tag/SHA/branch."
+        )
+    checkout_dir = Path(tempfile.mkdtemp(prefix="harbor-adapter-"))
+    branch_args = ["--branch", ref] if ref else []
+    result = git("clone", "--depth", "1", *branch_args, repo, str(checkout_dir))
+    if result.returncode != 0 and ref:
+        # A shallow ``--branch`` clone can fail (e.g. for a bare SHA): clone fully.
+        logger.warning("Shallow clone by ref failed; retrying full fetch checkout")
+        shutil.rmtree(checkout_dir, ignore_errors=True)
+        checkout_dir = Path(tempfile.mkdtemp(prefix="harbor-adapter-"))
+        result = git("clone", repo, str(checkout_dir))
+        if result.returncode == 0:
+            result = git("checkout", ref, cwd=checkout_dir)
+    if result.returncode != 0:
+        shutil.rmtree(checkout_dir, ignore_errors=True)
+        raise RuntimeError(f"Failed to checkout Harbor adapter repo: {result.stderr}")
+    # Record the exact commit; fall back to "unknown" if rev-parse itself fails.
+    sha_result = git("rev-parse", "HEAD", cwd=checkout_dir)
+    resolved_sha = (
+        sha_result.stdout.strip() if sha_result.returncode == 0 else "unknown"
+    )
+    return checkout_dir, resolved_sha
+
+
+def _resolve_target(
+    args: argparse.Namespace,
+) -> tuple[str, str, str | None, str | None]:
+    """Resolve CLI args to ``(target, target_type, checkout_dir, adapter_sha)``.
+
+    An adapter checkout made here is deleted again if resolution then fails,
+    because the caller only learns its location from the return value.
+    """
+    checkout_dir: Path | None = None
+    adapter_sha: str | None = None
+    target = args.harbor_target
+    target_type = args.harbor_target_type
+
+    if args.harbor_adapter_repo or args.harbor_adapter_path:
+        repo = args.harbor_adapter_repo or DEFAULT_ADAPTER_REPO
+        checkout_dir, adapter_sha = _checkout_adapter(repo, args.harbor_adapter_ref)
+    try:
+        if checkout_dir and args.harbor_adapter_path:
+            target_path = checkout_dir / args.harbor_adapter_path
+            if not target_path.exists():
+                raise RuntimeError(f"Harbor adapter path does not exist: {target_path}")
+            target = str(target_path)
+        if not target:
+            raise RuntimeError("A Harbor target or adapter path is required")
+    except RuntimeError:
+        if checkout_dir:
+            shutil.rmtree(checkout_dir, ignore_errors=True)
+        raise
+
+    if target_type == "auto":
+        # Existing paths are configs (YAML) or dataset dirs; anything else is a name.
+        path = Path(target)
+        is_yaml = path.suffix in {".yaml", ".yml"}
+        fallback = "config" if is_yaml else "path"
+        target_type = fallback if path.exists() else "dataset"
+    checkout = str(checkout_dir) if checkout_dir else None
+    return target, target_type, checkout, adapter_sha
+
+
+SECRET_KEY_PATTERNS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL", "PASSPHRASE")
+SENSITIVE_VALUE_FLAGS = ("--ae", "--ak")  # Harbor flags followed by KEY=VALUE
+
+
+def _is_sensitive_value(prev: str, part: str) -> bool:
+    """Tell whether ``part`` follows ``--ae``/``--ak`` and has a secret-looking key."""
+    if prev in SENSITIVE_VALUE_FLAGS:
+        name = part.partition("=")[0].upper()
+        return any(marker in name for marker in SECRET_KEY_PATTERNS)
+    return False
+
+
+def _target_args(target: str, target_type: str) -> list[str]:
+    """Map a resolved target type to the matching Harbor CLI flag plus ``target``."""
+    flags = {"dataset": "-d", "config": "-c", "path": "-p"}
+    flag = flags.get(target_type)
+    if flag is None:
+        # Unknown types are rejected rather than silently passed through.
+        raise ValueError(f"Unsupported Harbor target type: {target_type}")
+    return [flag, target]
+
+
+def _parse_key_value(values: list[str]) -> list[str]:
+    """Return ``values`` unchanged after checking each entry contains ``=``."""
+    if (bad := next((v for v in values if "=" not in v), None)) is not None:
+        raise ValueError(f"Expected KEY=VALUE, got {bad!r}")
+    return values
+
+
+def _split_json_values(raw: str | None) -> list[str]:
+    """Expand a JSON object or list into KEY=VALUE strings; non-str values as JSON."""
+    data = json.loads(raw) if raw else []
+    if isinstance(data, dict):
+        texts = (v if isinstance(v, str) else json.dumps(v) for v in data.values())
+        return [f"{key}={text}" for key, text in zip(data, texts)]
+    if isinstance(data, list) and all(isinstance(item, str) for item in data):
+        return data
+    raise ValueError("Expected a JSON object or list of KEY=VALUE strings")
+
+
+def run_harbor(
+    args: argparse.Namespace,
+    llm: LLM,
+    output_dir: str,
+    target: str,
+    target_type: str,
+    checkout_dir: str | None = None,
+    adapter_sha: str | None = None,
+) -> Path:
+    """Invoke ``harbor run`` and return the directory passed as ``--jobs-dir``.
+
+    Secret-looking ``--ae``/``--ak`` values are masked in the logged command;
+    ``adapter_sha`` is currently unused. Raises ``RuntimeError`` on non-zero exit.
+    """
+    jobs_dir = Path(output_dir) / "harbor_output"
+    jobs_dir.mkdir(parents=True, exist_ok=True)
+
+    cmd = [args.harbor_executable, "run", *_target_args(target, target_type)]
+    cmd += ["-a", args.harbor_agent, "-m", llm.model]
+    cmd += ["--jobs-dir", str(jobs_dir.resolve())]
+    cmd += ["--n-concurrent", str(args.num_workers)]
+    # Agent environment: LLM credentials first, then user-supplied pairs.
+    if llm.api_key:
+        cmd += ["--ae", f"LLM_API_KEY={_secret_value(llm.api_key)}"]
+    if llm.base_url:
+        cmd += ["--ae", f"LLM_BASE_URL={llm.base_url}"]
+    env_pairs = _parse_key_value(
+        [*args.agent_env, *_split_json_values(args.agent_env_json)]
+    )
+    for pair in env_pairs:
+        cmd += ["--ae", pair]
+    kwarg_pairs = _parse_key_value(
+        [*args.agent_kwarg, *_split_json_values(args.agent_kwarg_json)]
+    )
+    for pair in kwarg_pairs:
+        cmd += ["--ak", pair]
+    for task_id in args.task_id or []:
+        # For path targets only the final ``/``-separated component is passed.
+        name = task_id.rsplit("/", 1)[-1] if target_type == "path" else task_id
+        cmd += [args.task_filter_flag, name]
+    if args.n_limit is not None:
+        cmd += ["--n-tasks", str(args.n_limit)]
+    for extra_arg in args.harbor_arg:
+        cmd += shlex.split(extra_arg)
+
+    previous = ["", *cmd]
+    safe_cmd = [
+        "***" if _is_sensitive_value(before, part) else part
+        for before, part in zip(previous, cmd)
+    ]
+    logger.info("Running Harbor command: %s", " ".join(safe_cmd))
+    if checkout_dir:
+        logger.info("Using Harbor adapter checkout: %s", checkout_dir)
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode == 0:
+        logger.info("Harbor stdout: %s", result.stdout)
+        return jobs_dir
+    logger.error("Harbor stdout: %s", result.stdout)
+    logger.error("Harbor stderr: %s", result.stderr)
+    raise RuntimeError(
+        f"Harbor run failed with exit code {result.returncode}: {result.stderr}"
+    )
+
+
+def _parser() -> argparse.ArgumentParser:
+    """Build the CLI parser for the generic Harbor inference runner.
+
+    Repeatable options (``--agent-env``, ``--agent-kwarg``, ``--harbor-arg``,
+    ``--task-id``) use ``action="append"``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run a generic Harbor evaluation with OpenHands SDK"
+    )
+    add = parser.add_argument
+    add("llm_config_path")
+
+    # What to run: a dataset name, a config/path on disk, or a path in a git repo.
+    add("--harbor-target", help="Harbor dataset name, config path, or dataset path")
+    add(
+        "--harbor-target-type",
+        choices=["auto", "dataset", "config", "path"],
+        default="auto",
+    )
+    add("--harbor-adapter-repo", help="Git repository containing the Harbor adapter")
+    add("--harbor-adapter-ref", help="Git ref/SHA/tag for adapter repo")
+    add(
+        "--harbor-adapter-path",
+        help="Path inside adapter repo to a Harbor YAML config or dataset directory",
+    )
+
+    # How Harbor is invoked and what is forwarded to the agent.
+    add("--harbor-agent", default="openhands-sdk")
+    add("--harbor-executable", default="harbor")
+    add("--task-filter-flag", default="--include-task-name")
+    add("--agent-env", action="append", default=[], help="KEY=VALUE passed as --ae")
+    add("--agent-env-json", help="JSON object passed as repeated --ae")
+    add("--agent-kwarg", action="append", default=[], help="KEY=VALUE passed as --ak")
+    add("--agent-kwarg-json", help="JSON object passed as repeated --ak")
+    add("--harbor-arg", action="append", default=[], help="Additional raw Harbor args")
+
+    # Output layout and task selection.
+    add("--benchmark-slug", default="harbor")
+    add("--output-dir", default="./evaluation_outputs")
+    add("--num-workers", type=int, default=1)
+    add("--n-limit", type=int)
+    add("--select", help="Text file containing task IDs")
+    add("--task-id", action="append")
+    add("--note")
+    add("--skip-harbor", action="store_true")
+    return parser
+
+
+def main() -> None:
+    """CLI entry point: run Harbor unless skipped, then convert its output."""
+    args = _parser().parse_args()
+
+    if not os.path.isfile(args.llm_config_path):
+        logger.error("LLM config file does not exist: %s", args.llm_config_path)
+        sys.exit(1)
+    with open(args.llm_config_path, encoding="utf-8") as f:
+        llm = LLM.model_validate_json(f.read())
+
+    if args.select:
+        args.task_id = [*(args.task_id or []), *_load_task_ids(args.select)]
+
+    if not args.skip_harbor and not check_harbor_installed(args.harbor_executable):
+        logger.error("Harbor CLI is not installed; install with `pip install harbor`.")
+        sys.exit(1)
+
+    structured_output_dir = construct_eval_output_dir(
+        base_dir=args.output_dir,
+        dataset_name=args.benchmark_slug,
+        model_name=llm.model,
+        # Standard iteration cap used by all benchmark runners in this repo
+        max_iterations=100,
+        eval_note=args.note,
+    )
+    os.makedirs(structured_output_dir, exist_ok=True)
+    output_path = Path(structured_output_dir) / OUTPUT_FILENAME
+
+    # Target resolution and metadata run under ``try`` so checkouts get removed.
+    checkout_dir: str | None = None
+    try:
+        target, target_type, checkout_dir, adapter_sha = _resolve_target(args)
+        metadata = {
+            "llm": llm.model_dump_json(),
+            "benchmark": args.benchmark_slug,
+            "harbor_target": target,
+            "harbor_target_type": target_type,
+            "harbor_adapter_repo": args.harbor_adapter_repo,
+            "harbor_adapter_ref": args.harbor_adapter_ref,
+            "harbor_adapter_resolved_sha": adapter_sha,
+            "harbor_adapter_path": args.harbor_adapter_path,
+            "harbor_adapter_checkout": checkout_dir,
+            "harbor_agent": args.harbor_agent,
+            "timestamp": datetime.now(timezone.utc).isoformat(),
+            "note": args.note,
+        }
+        metadata_path = Path(structured_output_dir) / "metadata.json"
+        with open(metadata_path, "w", encoding="utf-8") as f:
+            json.dump(metadata, f, indent=2)
+        if args.skip_harbor:
+            harbor_output_dir = Path(structured_output_dir) / "harbor_output"
+        else:
+            harbor_output_dir = run_harbor(
+                args,
+                llm,
+                structured_output_dir,
+                target,
+                target_type,
+                checkout_dir,
+                adapter_sha,
+            )
+        convert_harbor_to_eval_output(
+            harbor_output_dir=harbor_output_dir, eval_output_path=output_path
+        )
+    except Exception as exc:
+        logger.error("Harbor inference failed: %s", exc)
+        sys.exit(1)
+    finally:
+        if checkout_dir:
+            shutil.rmtree(checkout_dir, ignore_errors=True)
+
+    if output_path.exists():
+        generate_cost_report(str(output_path))
+    print(json.dumps({"output_json": str(output_path)}))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/pyproject.toml b/pyproject.toml
index c29e07cf..ce3d0319 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -54,6 +54,8 @@ dependencies = [
 
 [project.scripts]
 validate-cfg = "benchmarks.scripts.validate_cfg:main"
+harbor-infer = "benchmarks.harbor.run_infer:main"
+harbor-eval = "benchmarks.harbor.eval_infer:main"
 swebench-infer = "benchmarks.swebench.run_infer:main"
 swtbench-infer = "benchmarks.swtbench.run_infer:main"
 swebench-eval = "benchmarks.swebench.eval_infer:main"
diff --git a/tests/test_harbor_run_infer.py b/tests/test_harbor_run_infer.py
new file mode 100644
index 00000000..7d2d4167
--- /dev/null
+++ b/tests/test_harbor_run_infer.py
@@ -0,0 +1,199 @@
+"""Tests for the generic Harbor run_infer helpers."""
+
+from __future__ import annotations
+
+import argparse
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+from benchmarks.harbor.run_infer import (
+    _is_sensitive_value,
+    _load_task_ids,
+    _parse_key_value,
+    _resolve_target,
+    _split_json_values,
+    _target_args,
+)
+
+
+def test_load_task_ids_strips_and_ignores_comments(tmp_path: Path) -> None:
+    """Whitespace is stripped; comment and blank lines are dropped."""
+    (tmp_path / "tasks.txt").write_text("# comment\n  task_a  \n\ntask_b\n", "utf-8")
+    assert _load_task_ids(str(tmp_path / "tasks.txt")) == ["task_a", "task_b"]
+
+
+def test_target_args_dataset() -> None:
+    assert _target_args(target="foo", target_type="dataset") == ["-d", "foo"]
+
+
+def test_target_args_config() -> None:
+    assert _target_args(target="foo.yaml", target_type="config") == ["-c", "foo.yaml"]
+
+
+def test_target_args_path() -> None:
+    assert _target_args(target="some/path", target_type="path") == ["-p", "some/path"]
+
+
+def test_target_args_invalid() -> None:
+    with pytest.raises(ValueError, match="Unsupported Harbor target type"):
+        _target_args(target="foo", target_type="bogus")
+
+
+def test_parse_key_value_ok() -> None:
+    assert _parse_key_value(values=["A=1", "B=2"]) == ["A=1", "B=2"]
+
+
+def test_parse_key_value_missing_equals() -> None:
+    with pytest.raises(ValueError, match="Expected KEY=VALUE"):
+        _parse_key_value(values=["bad"])
+
+
+def test_split_json_values_none() -> None:
+    assert _split_json_values(raw=None) == []
+
+
+def test_split_json_values_dict() -> None:
+    assert _split_json_values(raw='{"A": "1", "B": "2"}') == ["A=1", "B=2"]
+
+
+def test_split_json_values_list() -> None:
+    assert _split_json_values(raw='["A=1", "B=2"]') == ["A=1", "B=2"]
+
+
+def test_split_json_values_invalid() -> None:
+    with pytest.raises(ValueError, match="Expected a JSON object or list"):
+        _split_json_values(raw='"not-an-object-or-list"')
+
+
+def _make_args(**kwargs: object) -> argparse.Namespace:
+    """Build a Namespace with ``_resolve_target``'s inputs, overridable by kwargs."""
+    base: dict[str, object] = {
+        "harbor_target": None,
+        "harbor_target_type": "auto",
+        "harbor_adapter_repo": None,
+        "harbor_adapter_ref": None,
+        "harbor_adapter_path": None,
+    }
+    return argparse.Namespace(**{**base, **kwargs})
+
+
+def test_resolve_target_requires_target() -> None:
+    """No target and no adapter path is a usage error."""
+    expected = "A Harbor target or adapter path is required"
+    with pytest.raises(RuntimeError, match=expected):
+        _resolve_target(_make_args())
+
+
+def test_resolve_target_dataset_auto() -> None:
+    """An ``auto`` target that is not an existing path resolves to a dataset.
+
+    No adapter repo is involved, so no checkout directory or SHA is reported.
+    """
+    args = _make_args(harbor_target="my-dataset")
+    target, target_type, checkout, sha = _resolve_target(args)
+    assert (target, target_type) == ("my-dataset", "dataset")
+    assert checkout is None and sha is None
+
+
+def test_resolve_target_config_auto(tmp_path: Path) -> None:
+    """An existing ``.yaml`` file with type ``auto`` is detected as a config."""
+    config_file = tmp_path / "config.yaml"
+    config_file.write_text("foo: bar", encoding="utf-8")
+
+    args = _make_args(harbor_target=str(config_file))
+    target, target_type, checkout, sha = _resolve_target(args)
+
+    assert (target, target_type) == (str(config_file), "config")
+    assert checkout is None and sha is None
+
+
+def test_resolve_target_path_auto(tmp_path: Path) -> None:
+    """An existing directory with type ``auto`` is detected as a path target."""
+    dataset_dir = tmp_path / "some_dir"
+    dataset_dir.mkdir()
+
+    args = _make_args(harbor_target=str(dataset_dir))
+    target, target_type, checkout, sha = _resolve_target(args)
+
+    assert (target, target_type) == (str(dataset_dir), "path")
+    assert checkout is None and sha is None
+
+
+def test_resolve_target_explicit_type() -> None:
+    """An explicit target type is returned as given, with no auto-detection."""
+    args = _make_args(harbor_target="ds", harbor_target_type="dataset")
+
+    target, target_type, checkout, sha = _resolve_target(args)
+
+    assert (target, target_type) == ("ds", "dataset")
+    assert checkout is None and sha is None
+
+
+def test_resolve_target_adapter_path(tmp_path: Path) -> None:
+    """An adapter path is resolved inside the (patched) adapter checkout."""
+    clone_dir = tmp_path / "fake-clone"
+    clone_dir.mkdir()
+    adapter_cfg = clone_dir / "adapter.yaml"
+    adapter_cfg.write_text("foo: bar", encoding="utf-8")
+
+    args = _make_args(
+        harbor_adapter_repo="https://example.com/repo.git",
+        harbor_adapter_path="adapter.yaml",
+    )
+
+    # Stub the git clone so the test needs neither network nor git.
+    checkout_stub = patch(
+        "benchmarks.harbor.run_infer._checkout_adapter",
+        return_value=(clone_dir, "abc123def456"),
+    )
+    with checkout_stub:
+        target, target_type, checkout, sha = _resolve_target(args)
+
+    assert (target, target_type) == (str(adapter_cfg), "config")
+    assert (checkout, sha) == (str(clone_dir), "abc123def456")
+
+
+# --- Secret masking tests ---
+
+
+def test_is_sensitive_value_key_suffix() -> None:
+    assert _is_sensitive_value("--ae", "LLM_API_KEY=secret123") is True
+
+
+def test_is_sensitive_value_token() -> None:
+    assert _is_sensitive_value("--ae", "GITHUB_TOKEN=ghp_xxx") is True
+
+
+def test_is_sensitive_value_secret() -> None:
+    assert _is_sensitive_value("--ae", "HF_SECRET=hf_xxx") is True
+
+
+def test_is_sensitive_value_password() -> None:
+    assert _is_sensitive_value("--ae", "DB_PASSWORD=p455w0rd") is True
+
+
+def test_is_sensitive_value_ak_token() -> None:
+    """Regression: secret-looking keys passed via ``--ak`` are masked too."""
+    assert _is_sensitive_value("--ak", "GITHUB_TOKEN=ghp_xxx") is True
+
+
+def test_is_sensitive_value_ak_key_suffix() -> None:
+    assert _is_sensitive_value("--ak", "API_KEY=secret123") is True
+
+
+def test_is_sensitive_value_ak_non_secret() -> None:
+    assert _is_sensitive_value("--ak", "MAX_ITERATIONS=100") is False
+
+
+def test_is_sensitive_value_unknown_flag() -> None:
+    assert _is_sensitive_value("--foo", "GITHUB_TOKEN=ghp_xxx") is False
+
+
+def test_is_sensitive_value_non_secret() -> None:
+    assert _is_sensitive_value("--ae", "LLM_BASE_URL=https://api.example.com") is False
+
+
+def test_is_sensitive_value_non_secret_env() -> None:
+    assert _is_sensitive_value("--ae", "MAX_ITERATIONS=100") is False