From a9265143f562d8025543acb2a88a30b26a8db399 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 10:08:30 -0400 Subject: [PATCH 1/8] Add generic Harbor benchmark runner Co-authored-by: openhands --- benchmarks/harbor/__init__.py | 1 + benchmarks/harbor/eval_infer.py | 155 +++++++++++++++++ benchmarks/harbor/run_infer.py | 296 ++++++++++++++++++++++++++++++++ pyproject.toml | 2 + 4 files changed, 454 insertions(+) create mode 100644 benchmarks/harbor/__init__.py create mode 100644 benchmarks/harbor/eval_infer.py create mode 100644 benchmarks/harbor/run_infer.py diff --git a/benchmarks/harbor/__init__.py b/benchmarks/harbor/__init__.py new file mode 100644 index 000000000..d61e4690b --- /dev/null +++ b/benchmarks/harbor/__init__.py @@ -0,0 +1 @@ +"""Generic Harbor-backed benchmark runner.""" diff --git a/benchmarks/harbor/eval_infer.py b/benchmarks/harbor/eval_infer.py new file mode 100644 index 000000000..d92f44d58 --- /dev/null +++ b/benchmarks/harbor/eval_infer.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 +"""Generic Harbor evaluation report generator.""" + +from __future__ import annotations + +import argparse +import json +import sys +from pathlib import Path +from typing import Any + +from benchmarks.utils.laminar import LaminarService +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + + +def _metric( + data: dict[str, Any], test_result: dict[str, Any], key: str, default: Any +) -> Any: + metrics = data.get("metrics", {}) + value = metrics.get(key) + if value is not None: + return value + return test_result.get("final_metrics", {}).get(key, default) + + +def process_harbor_results(input_file: str, output_file: str) -> dict[str, Any]: + completed_ids: set[str] = set() + resolved_ids: set[str] = set() + unresolved_ids: set[str] = set() + incomplete_ids: set[str] = set() + error_ids: set[str] = set() + + total_cost_usd = 0.0 + total_prompt_tokens = 0 + total_completion_tokens = 0 + + with open(input_file, encoding="utf-8") as infile: + for line_num, line in enumerate(infile, 1): + line = line.strip() + if not line: + continue + try: + data = json.loads(line) + except json.JSONDecodeError as exc: + logger.error("Line %d: invalid JSON: %s", line_num, exc) + continue + + instance_id = data.get("instance_id") + if not instance_id: + logger.warning("Line %d: missing instance_id", line_num) + continue + if instance_id in completed_ids or instance_id in incomplete_ids: + logger.warning( + "Line %d: duplicate instance_id %s", line_num, instance_id + ) + continue + + if data.get("error"): + error_ids.add(instance_id) + incomplete_ids.add(instance_id) + continue + + test_result = data.get("test_result", {}) + completed_ids.add(instance_id) + if test_result.get("passed") is True: + resolved_ids.add(instance_id) + else: + unresolved_ids.add(instance_id) + + total_cost_usd += float( + _metric(data, test_result, "total_cost_usd", 0.0) or 0.0 + ) + total_prompt_tokens += int( + _metric(data, test_result, "total_prompt_tokens", 0) or 0 + ) + total_completion_tokens += int( + _metric(data, test_result, "total_completion_tokens", 0) or 0 + ) + + error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl") + if error_path.exists(): + with open(error_path, encoding="utf-8") as error_file: + for line in error_file: + try: + data = json.loads(line) + except json.JSONDecodeError: + continue + instance_id = data.get("instance_id") + if instance_id and instance_id not in completed_ids: + incomplete_ids.add(instance_id) + error_ids.add(instance_id) + + submitted_ids = completed_ids | incomplete_ids + report: dict[str, Any] = { + "total_instances": len(submitted_ids), + "submitted_instances": len(submitted_ids), + "completed_instances": len(completed_ids), + "incomplete_instances": len(incomplete_ids), + "resolved_instances": len(resolved_ids), + "unresolved_instances": len(unresolved_ids), + "error_instances": len(error_ids), + "submitted_ids": sorted(submitted_ids), + "completed_ids": sorted(completed_ids), + "incomplete_ids": sorted(incomplete_ids), + "resolved_ids": sorted(resolved_ids), + "unresolved_ids": sorted(unresolved_ids), + "error_ids": sorted(error_ids), + "aggregate_metrics": { + "total_cost_usd": total_cost_usd, + "total_prompt_tokens": total_prompt_tokens, + "total_completion_tokens": total_completion_tokens, + }, + } + + with open(output_file, "w", encoding="utf-8") as outfile: + json.dump(report, outfile, indent=4) + + logger.info("Harbor report generated at %s", output_file) + logger.info( + "Resolved %d/%d completed instances", len(resolved_ids), len(completed_ids) + ) + return report + + +def main() -> None: + parser = argparse.ArgumentParser(description="Process generic Harbor output.jsonl") + parser.add_argument("input_file", help="Path to Harbor-converted output.jsonl") + parser.add_argument("--output-file", help="Output report JSON path") + args = parser.parse_args() + + input_file = Path(args.input_file) + if not input_file.exists(): + logger.error("Input file does not exist: %s", input_file) + sys.exit(1) + + output_file = ( + Path(args.output_file) if args.output_file else input_file.with_suffix(".report.json") + ) + try: + report = process_harbor_results(str(input_file), str(output_file)) + generate_cost_report(str(input_file)) + LaminarService.send_eval_report(report, str(input_file)) + except Exception as exc: + logger.error("Harbor evaluation failed: %s", exc) + sys.exit(1) + + print(json.dumps({"report_json": str(output_file)})) + + +if __name__ == "__main__": + main() diff --git a/benchmarks/harbor/run_infer.py b/benchmarks/harbor/run_infer.py new file mode 100644 index 000000000..2f537733e --- /dev/null +++ b/benchmarks/harbor/run_infer.py @@ -0,0 +1,296 @@ +#!/usr/bin/env python3 +"""Run any Harbor dataset/config/path with the OpenHands SDK agent.""" + +from __future__ import annotations + +import argparse +import json +import os +import shlex +import shutil +import subprocess +import sys +import tempfile +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.harbor import check_harbor_installed, convert_harbor_to_eval_output +from benchmarks.utils.report_costs import generate_cost_report +from openhands.sdk import LLM, get_logger + + +logger = get_logger(__name__) +OUTPUT_FILENAME = "output.jsonl" +DEFAULT_ADAPTER_REPO = "https://github.com/harbor-framework/harbor.git" + + +def _load_task_ids(filepath: str) -> list[str]: + task_ids: list[str] = [] + with open(filepath, encoding="utf-8") as f: + for line in f: + value = line.strip() + if value and not value.startswith("#"): + task_ids.append(value) + return task_ids + + +def _secret_value(value: object) -> str: + if hasattr(value, "get_secret_value"): + return value.get_secret_value() # type: ignore[no-any-return, attr-defined] + return str(value) + + +def _checkout_adapter(repo: str, ref: str | None) -> Path: + checkout_dir = Path(tempfile.mkdtemp(prefix="harbor-adapter-")) + cmd = ["git", "clone", "--depth", "1"] + if ref: + cmd.extend(["--branch", ref]) + cmd.extend([repo, str(checkout_dir)]) + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0 and ref: + logger.warning("Shallow clone by ref failed; retrying full fetch checkout") + shutil.rmtree(checkout_dir, ignore_errors=True) + checkout_dir = Path(tempfile.mkdtemp(prefix="harbor-adapter-")) + result = subprocess.run( + ["git", "clone", repo, str(checkout_dir)], capture_output=True, text=True + ) + if result.returncode == 0: + result = subprocess.run( + ["git", "checkout", ref], + cwd=checkout_dir, + capture_output=True, + text=True, + ) + if result.returncode != 0: + raise RuntimeError(f"Failed to checkout Harbor adapter repo: {result.stderr}") + return checkout_dir + + +def _resolve_target(args: argparse.Namespace) -> tuple[str, str, str | None]: + checkout_dir: Path | None = None + target = args.harbor_target + target_type = args.harbor_target_type + + if args.harbor_adapter_repo or args.harbor_adapter_path: + repo = args.harbor_adapter_repo or DEFAULT_ADAPTER_REPO + checkout_dir = _checkout_adapter(repo, args.harbor_adapter_ref) + if args.harbor_adapter_path: + target_path = checkout_dir / args.harbor_adapter_path + if not target_path.exists(): + raise RuntimeError(f"Harbor adapter path does not exist: {target_path}") + target = str(target_path) + if target_type == "auto": + target_type = ( + "config" if target_path.suffix in {".yaml", ".yml"} else "path" + ) + + if not target: + raise RuntimeError("A Harbor target or adapter path is required") + + if target_type == "auto": + path = Path(target) + if path.exists(): + target_type = "config" if path.suffix in {".yaml", ".yml"} else "path" + else: + target_type = "dataset" + + return target, target_type, str(checkout_dir) if checkout_dir else None + + +def _target_args(target: str, target_type: str) -> list[str]: + if target_type == "dataset": + return ["-d", target] + if target_type == "config": + return ["-c", target] + if target_type == "path": + return ["-p", target] + raise ValueError(f"Unsupported Harbor target type: {target_type}") + + +def _parse_key_value(values: list[str]) -> list[str]: + for value in values: + if "=" not in value: + raise ValueError(f"Expected KEY=VALUE, got {value!r}") + return values + + +def _split_json_values(raw: str | None) -> list[str]: + if not raw: + return [] + data = json.loads(raw) + if isinstance(data, dict): + return [f"{key}={value}" for key, value in data.items()] + if isinstance(data, list) and all(isinstance(item, str) for item in data): + return data + raise ValueError("Expected a JSON object or list of KEY=VALUE strings") + + +def run_harbor(args: argparse.Namespace, llm: LLM, output_dir: str) -> Path: + target, target_type, checkout_dir = _resolve_target(args) + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + args.harbor_executable, + "run", + *_target_args(target, target_type), + "-a", + args.harbor_agent, + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(args.num_workers), + ] + + if llm.api_key: + cmd.extend(["--ae", f"LLM_API_KEY={_secret_value(llm.api_key)}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + for env_value in _parse_key_value( + [*args.agent_env, *_split_json_values(args.agent_env_json)] + ): + cmd.extend(["--ae", env_value]) + for kwarg_value in _parse_key_value( + [*args.agent_kwarg, *_split_json_values(args.agent_kwarg_json)] + ): + cmd.extend(["--ak", kwarg_value]) + for task_id in args.task_id or []: + task_value = task_id.rsplit("/", 1)[-1] if target_type == "path" else task_id + cmd.extend([args.task_filter_flag, task_value]) + if args.n_limit is not None: + cmd.extend(["--n-tasks", str(args.n_limit)]) + for extra_arg in args.harbor_arg: + cmd.extend(shlex.split(extra_arg)) + + safe_cmd = [ + "***" if prev == "--ae" and part.split("=", 1)[0].endswith("KEY") else part + for prev, part in zip([""] + cmd, cmd) + ] + logger.info("Running Harbor command: %s", " ".join(safe_cmd)) + if checkout_dir: + logger.info("Using Harbor adapter checkout: %s", checkout_dir) + + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode != 0: + logger.error("Harbor stdout: %s", result.stdout) + logger.error("Harbor stderr: %s", result.stderr) + raise RuntimeError( + f"Harbor run failed with exit code {result.returncode}: {result.stderr}" + ) + logger.info("Harbor stdout: %s", result.stdout) + return harbor_output_dir + + +def _parser() -> argparse.ArgumentParser: + parser = argparse.ArgumentParser( + description="Run a generic Harbor evaluation with OpenHands SDK" + ) + parser.add_argument("llm_config_path") + parser.add_argument( + "--harbor-target", help="Harbor dataset name, config path, or dataset path" + ) + parser.add_argument( + "--harbor-target-type", + choices=["auto", "dataset", "config", "path"], + default="auto", + ) + parser.add_argument( + "--harbor-adapter-repo", help="Git repository containing the Harbor adapter" + ) + parser.add_argument("--harbor-adapter-ref", help="Git ref/SHA/tag for adapter repo") + parser.add_argument( + "--harbor-adapter-path", + help="Path inside adapter repo to a Harbor YAML config or dataset directory", + ) + parser.add_argument("--harbor-agent", default="openhands-sdk") + parser.add_argument("--harbor-executable", default="harbor") + parser.add_argument("--task-filter-flag", default="--include-task-name") + parser.add_argument( + "--agent-env", action="append", default=[], help="KEY=VALUE passed as --ae" + ) + parser.add_argument("--agent-env-json", help="JSON object passed as repeated --ae") + parser.add_argument( + "--agent-kwarg", action="append", default=[], help="KEY=VALUE passed as --ak" + ) + parser.add_argument("--agent-kwarg-json", help="JSON object passed as repeated --ak") + parser.add_argument( + "--harbor-arg", action="append", default=[], help="Additional raw Harbor args" + ) + parser.add_argument("--benchmark-slug", default="harbor") + parser.add_argument("--output-dir", default="./evaluation_outputs") + parser.add_argument("--num-workers", type=int, default=1) + parser.add_argument("--n-limit", type=int) + parser.add_argument("--select", help="Text file containing task IDs") + parser.add_argument("--task-id", action="append") + parser.add_argument("--note") + parser.add_argument("--skip-harbor", action="store_true") + return parser + + +def main() -> None: + args = _parser().parse_args() + + if not os.path.isfile(args.llm_config_path): + logger.error("LLM config file does not exist: %s", args.llm_config_path) + sys.exit(1) + with open(args.llm_config_path, encoding="utf-8") as f: + llm = LLM.model_validate_json(f.read()) + + if args.select: + args.task_id = [*(args.task_id or []), *_load_task_ids(args.select)] + + if not args.skip_harbor and not check_harbor_installed(args.harbor_executable): + logger.error("Harbor CLI is not installed; install with `pip install harbor`.") + sys.exit(1) + + structured_output_dir = construct_eval_output_dir( + base_dir=args.output_dir, + dataset_name=args.benchmark_slug, + model_name=llm.model, + max_iterations=100, + eval_note=args.note, + ) + os.makedirs(structured_output_dir, exist_ok=True) + + target, target_type, checkout_dir = _resolve_target(args) + metadata = { + "llm": llm.model_dump_json(), + "benchmark": args.benchmark_slug, + "harbor_target": target, + "harbor_target_type": target_type, + "harbor_adapter_repo": args.harbor_adapter_repo, + "harbor_adapter_ref": args.harbor_adapter_ref, + "harbor_adapter_path": args.harbor_adapter_path, + "harbor_adapter_checkout": checkout_dir, + "harbor_agent": args.harbor_agent, + "timestamp": datetime.now(timezone.utc).isoformat(), + "note": args.note, + } + with open(Path(structured_output_dir) / "metadata.json", "w", encoding="utf-8") as f: + json.dump(metadata, f, indent=2) + + output_path = Path(structured_output_dir) / OUTPUT_FILENAME + try: + harbor_output_dir = ( + Path(structured_output_dir) / "harbor_output" + if args.skip_harbor + else run_harbor(args, llm, structured_output_dir) + ) + convert_harbor_to_eval_output( + harbor_output_dir=harbor_output_dir, eval_output_path=output_path + ) + except Exception as exc: + logger.error("Harbor inference failed: %s", exc) + sys.exit(1) + + if output_path.exists(): + generate_cost_report(str(output_path)) + print(json.dumps({"output_json": str(output_path)})) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index c29e07cfd..ce3d03192 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,6 +54,8 @@ dependencies = [ [project.scripts] validate-cfg = "benchmarks.scripts.validate_cfg:main" +harbor-infer = "benchmarks.harbor.run_infer:main" +harbor-eval = "benchmarks.harbor.eval_infer:main" swebench-infer = "benchmarks.swebench.run_infer:main" swtbench-infer = "benchmarks.swtbench.run_infer:main" swebench-eval = "benchmarks.swebench.eval_infer:main" From 11d10ff2e9aa4328ac6b9262a3588fb6b5e891f8 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 11:13:39 -0400 Subject: [PATCH 2/8] Fix LaminarService call in eval_infer to use correct method and make telemetry non-fatal Co-authored-by: openhands --- benchmarks/harbor/eval_infer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/benchmarks/harbor/eval_infer.py b/benchmarks/harbor/eval_infer.py index d92f44d58..dcdb140c0 100644 --- a/benchmarks/harbor/eval_infer.py +++ b/benchmarks/harbor/eval_infer.py @@ -143,11 +143,15 @@ def main() -> None: try: report = process_harbor_results(str(input_file), str(output_file)) generate_cost_report(str(input_file)) - LaminarService.send_eval_report(report, str(input_file)) except Exception as exc: logger.error("Harbor evaluation failed: %s", exc) sys.exit(1) + try: + LaminarService.update_evaluation_scores(report, str(input_file)) + except Exception as exc: + logger.warning("Laminar telemetry reporting failed (non-fatal): %s", exc) + print(json.dumps({"report_json": str(output_file)})) From 131975bbed05951b0e7390d840fae86a12844161 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 18:07:01 -0400 Subject: [PATCH 3/8] fix: resolve ruff format/lint and pyright issues in harbor eval (#755) - Run ruff format on eval_infer.py and run_infer.py (line wrapping) - Fix LaminarService.update_evaluation_scores call to use singleton instance via get() with correct arguments (output_file, report_file) instead of calling unbound method with dict as self Co-authored-by: openhands --- benchmarks/harbor/eval_infer.py | 8 ++++++-- benchmarks/harbor/run_infer.py | 14 ++++++++++---- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/benchmarks/harbor/eval_infer.py b/benchmarks/harbor/eval_infer.py index dcdb140c0..78a4290cd 100644 --- a/benchmarks/harbor/eval_infer.py +++ b/benchmarks/harbor/eval_infer.py @@ -138,7 +138,9 @@ def main() -> None: sys.exit(1) output_file = ( - Path(args.output_file) if args.output_file else input_file.with_suffix(".report.json") + Path(args.output_file) + if args.output_file + else input_file.with_suffix(".report.json") ) try: report = process_harbor_results(str(input_file), str(output_file)) @@ -148,7 +150,9 @@ def main() -> None: sys.exit(1) try: - LaminarService.update_evaluation_scores(report, str(input_file)) + LaminarService.get().update_evaluation_scores( + str(input_file), str(output_file) + ) except Exception as exc: logger.warning("Laminar telemetry reporting failed (non-fatal): %s", exc) diff --git a/benchmarks/harbor/run_infer.py b/benchmarks/harbor/run_infer.py index 2f537733e..dd8a21e98 100644 --- a/benchmarks/harbor/run_infer.py +++ b/benchmarks/harbor/run_infer.py @@ -13,10 +13,12 @@ import tempfile from datetime import datetime, timezone from pathlib import Path -from typing import Any from benchmarks.utils.evaluation_utils import construct_eval_output_dir -from benchmarks.utils.harbor import check_harbor_installed, convert_harbor_to_eval_output +from benchmarks.utils.harbor import ( + check_harbor_installed, + convert_harbor_to_eval_output, +) from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -216,7 +218,9 @@ def _parser() -> argparse.ArgumentParser: parser.add_argument( "--agent-kwarg", action="append", default=[], help="KEY=VALUE passed as --ak" ) - parser.add_argument("--agent-kwarg-json", help="JSON object passed as repeated --ak") + parser.add_argument( + "--agent-kwarg-json", help="JSON object passed as repeated --ak" + ) parser.add_argument( "--harbor-arg", action="append", default=[], help="Additional raw Harbor args" ) @@ -270,7 +274,9 @@ def main() -> None: "timestamp": datetime.now(timezone.utc).isoformat(), "note": args.note, } - with open(Path(structured_output_dir) / "metadata.json", "w", encoding="utf-8") as f: + with open( + Path(structured_output_dir) / "metadata.json", "w", encoding="utf-8" + ) as f: json.dump(metadata, f, indent=2) output_path = Path(structured_output_dir) / OUTPUT_FILENAME From 5349fde0c5621ca6ee410133bbf466bc897e2d61 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 18:10:53 -0400 Subject: [PATCH 4/8] fix: match CI ruff formatting and remove unused variable (#755) - Use single-line LaminarService call to match ruff 0.13.0 formatting - Remove unused 'report' variable assignment Co-authored-by: openhands --- benchmarks/harbor/eval_infer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/harbor/eval_infer.py b/benchmarks/harbor/eval_infer.py index 78a4290cd..fe4c33473 100644 --- a/benchmarks/harbor/eval_infer.py +++ b/benchmarks/harbor/eval_infer.py @@ -143,16 +143,14 @@ def main() -> None: else input_file.with_suffix(".report.json") ) try: - report = process_harbor_results(str(input_file), str(output_file)) + process_harbor_results(str(input_file), str(output_file)) generate_cost_report(str(input_file)) except Exception as exc: logger.error("Harbor evaluation failed: %s", exc) sys.exit(1) try: - LaminarService.get().update_evaluation_scores( - str(input_file), str(output_file) - ) + LaminarService.get().update_evaluation_scores(str(input_file), str(output_file)) except Exception as exc: logger.warning("Laminar telemetry reporting failed (non-fatal): %s", exc) From 968064f505ad1caf6d8138b7f9065e58218858aa Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 18:20:46 -0400 Subject: [PATCH 5/8] fix: address review comments on harbor run_infer (#755) - Remove duplicate _secret_value; import from benchmarks.utils.harbor - Fix double _resolve_target call: pass resolved target/target_type/ checkout_dir into run_harbor instead of re-resolving (avoids double git clone when --harbor-adapter-repo is set) - Clean up adapter checkout temp dir in finally block after run - Add comment explaining max_iterations=100 standard cap - Add tests for harbor run_infer helpers (17 tests) Co-authored-by: openhands --- benchmarks/harbor/run_infer.py | 25 ++++-- tests/test_harbor_run_infer.py | 145 +++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 9 deletions(-) create mode 100644 tests/test_harbor_run_infer.py diff --git a/benchmarks/harbor/run_infer.py b/benchmarks/harbor/run_infer.py index dd8a21e98..fe06db59d 100644 --- a/benchmarks/harbor/run_infer.py +++ b/benchmarks/harbor/run_infer.py @@ -16,6 +16,7 @@ from benchmarks.utils.evaluation_utils import construct_eval_output_dir from benchmarks.utils.harbor import ( + _secret_value, check_harbor_installed, convert_harbor_to_eval_output, ) @@ -38,12 +39,6 @@ def _load_task_ids(filepath: str) -> list[str]: return task_ids -def _secret_value(value: object) -> str: - if hasattr(value, "get_secret_value"): - return value.get_secret_value() # type: ignore[no-any-return, attr-defined] - return str(value) - - def _checkout_adapter(repo: str, ref: str | None) -> Path: checkout_dir = Path(tempfile.mkdtemp(prefix="harbor-adapter-")) cmd = ["git", "clone", "--depth", "1"] @@ -129,8 +124,14 @@ def _split_json_values(raw: str | None) -> list[str]: raise ValueError("Expected a JSON object or list of KEY=VALUE strings") -def run_harbor(args: argparse.Namespace, llm: LLM, output_dir: str) -> Path: - target, target_type, checkout_dir = _resolve_target(args) +def run_harbor( + args: argparse.Namespace, + llm: LLM, + output_dir: str, + target: str, + target_type: str, + checkout_dir: str | None = None, +) -> Path: harbor_output_dir = Path(output_dir) / "harbor_output" harbor_output_dir.mkdir(parents=True, exist_ok=True) @@ -255,6 +256,7 @@ def main() -> None: base_dir=args.output_dir, dataset_name=args.benchmark_slug, model_name=llm.model, + # Standard iteration cap used by all benchmark runners in this repo max_iterations=100, eval_note=args.note, ) @@ -284,7 +286,9 @@ def main() -> None: harbor_output_dir = ( Path(structured_output_dir) / "harbor_output" if args.skip_harbor - else run_harbor(args, llm, structured_output_dir) + else run_harbor( + args, llm, structured_output_dir, target, target_type, checkout_dir + ) ) convert_harbor_to_eval_output( harbor_output_dir=harbor_output_dir, eval_output_path=output_path @@ -292,6 +296,9 @@ def main() -> None: except Exception as exc: logger.error("Harbor inference failed: %s", exc) sys.exit(1) + finally: + if checkout_dir: + shutil.rmtree(checkout_dir, ignore_errors=True) if output_path.exists(): generate_cost_report(str(output_path)) diff --git a/tests/test_harbor_run_infer.py b/tests/test_harbor_run_infer.py new file mode 100644 index 000000000..44bc9b440 --- /dev/null +++ b/tests/test_harbor_run_infer.py @@ -0,0 +1,145 @@ +"""Tests for the generic Harbor run_infer helpers.""" + +from __future__ import annotations + +import argparse +from pathlib import Path +from unittest.mock import patch + +import pytest + +from benchmarks.harbor.run_infer import ( + _load_task_ids, + _parse_key_value, + _resolve_target, + _split_json_values, + _target_args, +) + + +def test_load_task_ids_strips_and_ignores_comments(tmp_path: Path) -> None: + f = tmp_path / "tasks.txt" + f.write_text("# comment\n task_a \n\ntask_b\n", encoding="utf-8") + assert _load_task_ids(str(f)) == ["task_a", "task_b"] + + +def test_target_args_dataset() -> None: + assert _target_args("foo", "dataset") == ["-d", "foo"] + + +def test_target_args_config() -> None: + assert _target_args("foo.yaml", "config") == ["-c", "foo.yaml"] + + +def test_target_args_path() -> None: + assert _target_args("some/path", "path") == ["-p", "some/path"] + + +def test_target_args_invalid() -> None: + with pytest.raises(ValueError, match="Unsupported Harbor target type"): + _target_args("foo", "bogus") + + +def test_parse_key_value_ok() -> None: + assert _parse_key_value(["A=1", "B=2"]) == ["A=1", "B=2"] + + +def test_parse_key_value_missing_equals() -> None: + with pytest.raises(ValueError, match="Expected KEY=VALUE"): + _parse_key_value(["bad"]) + + +def test_split_json_values_none() -> None: + assert _split_json_values(None) == [] + + +def test_split_json_values_dict() -> None: + assert _split_json_values('{"A": "1", "B": "2"}') == ["A=1", "B=2"] + + +def test_split_json_values_list() -> None: + assert _split_json_values('["A=1", "B=2"]') == ["A=1", "B=2"] + + +def test_split_json_values_invalid() -> None: + with pytest.raises(ValueError, match="Expected a JSON object or list"): + _split_json_values('"not-an-object-or-list"') + + +def _make_args(**kwargs: object) -> argparse.Namespace: + defaults = dict( + harbor_target=None, + harbor_target_type="auto", + harbor_adapter_repo=None, + harbor_adapter_ref=None, + harbor_adapter_path=None, + ) + defaults.update(kwargs) + return argparse.Namespace(**defaults) + + +def test_resolve_target_requires_target() -> None: + with pytest.raises( + RuntimeError, match="A Harbor target or adapter path is required" + ): + _resolve_target(_make_args()) + + +def test_resolve_target_dataset_auto() -> None: + # When target doesn't exist on disk and type is auto, defaults to dataset + target, target_type, checkout = _resolve_target( + _make_args(harbor_target="my-dataset") + ) + assert target == "my-dataset" + assert target_type == "dataset" + assert checkout is None + + +def test_resolve_target_config_auto(tmp_path: Path) -> None: + cfg = tmp_path / "config.yaml" + cfg.write_text("foo: bar", encoding="utf-8") + target, target_type, checkout = _resolve_target(_make_args(harbor_target=str(cfg))) + assert target == str(cfg) + assert target_type == "config" + assert checkout is None + + +def test_resolve_target_path_auto(tmp_path: Path) -> None: + p = tmp_path / "some_dir" + p.mkdir() + target, target_type, checkout = _resolve_target(_make_args(harbor_target=str(p))) + assert target == str(p) + assert target_type == "path" + assert checkout is None + + +def test_resolve_target_explicit_type() -> None: + target, target_type, checkout = _resolve_target( + _make_args(harbor_target="ds", harbor_target_type="dataset") + ) + assert target == "ds" + assert target_type == "dataset" + assert checkout is None + + +def test_resolve_target_adapter_path(tmp_path: Path) -> None: + """Adapter path resolution clones the repo and resolves the target inside it.""" + fake_checkout = tmp_path / "fake-clone" + fake_checkout.mkdir() + cfg_inside = fake_checkout / "adapter.yaml" + cfg_inside.write_text("foo: bar", encoding="utf-8") + + with patch( + "benchmarks.harbor.run_infer._checkout_adapter", + return_value=fake_checkout, + ): + target, target_type, checkout = _resolve_target( + _make_args( + harbor_adapter_repo="https://example.com/repo.git", + harbor_adapter_path="adapter.yaml", + ) + ) + + assert target == str(cfg_inside) + assert target_type == "config" + assert checkout == str(fake_checkout) From 1fb976c44f7a5481c492503bb7cd2b11eb82d293 Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 18:22:30 -0400 Subject: [PATCH 6/8] fix: pyright type error in test helper (#755) Use .items() for dict update to satisfy pyright strict type checking. Co-authored-by: openhands --- tests/test_harbor_run_infer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_harbor_run_infer.py b/tests/test_harbor_run_infer.py index 44bc9b440..eee194c7d 100644 --- a/tests/test_harbor_run_infer.py +++ b/tests/test_harbor_run_infer.py @@ -67,14 +67,14 @@ def test_split_json_values_invalid() -> None: def _make_args(**kwargs: object) -> argparse.Namespace: - defaults = dict( + defaults: dict[str, object] = dict( harbor_target=None, harbor_target_type="auto", harbor_adapter_repo=None, harbor_adapter_ref=None, harbor_adapter_path=None, ) - defaults.update(kwargs) + defaults.update(kwargs.items()) return argparse.Namespace(**defaults) From cea63d6227510dbe4d688f46ae6a227e5a268e9e Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 18:44:14 -0400 Subject: [PATCH 7/8] =?UTF-8?q?fix:=20address=202nd=20round=20review=20?= =?UTF-8?q?=E2=80=94=20adapter=20reproducibility=20+=20secret=20masking=20?= =?UTF-8?q?(#755)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical fixes from 2nd AI review: - Record resolved adapter commit SHA in metadata for reproducibility - Warn when cloning adapter repo without a pinned ref - Broaden secret masking to cover TOKEN, SECRET, PASSWORD, CREDENTIAL, PASSPHRASE patterns (not just *KEY suffix), preventing GITHUB_TOKEN, HF_TOKEN, etc. from being logged in cleartext - Add _is_sensitive_env_value helper with 7 tests - Update _resolve_target to return 4-tuple including adapter_sha - Update all tests for new signature (24 tests total) Co-authored-by: openhands --- benchmarks/harbor/run_infer.py | 59 +++++++++++++++++++++++++++++----- tests/test_harbor_run_infer.py | 53 ++++++++++++++++++++++++++---- 2 files changed, 98 insertions(+), 14 deletions(-) diff --git a/benchmarks/harbor/run_infer.py b/benchmarks/harbor/run_infer.py index fe06db59d..5957cbf46 100644 --- a/benchmarks/harbor/run_infer.py +++ b/benchmarks/harbor/run_infer.py @@ -39,7 +39,17 @@ def _load_task_ids(filepath: str) -> list[str]: return task_ids -def _checkout_adapter(repo: str, ref: str | None) -> Path: +def _checkout_adapter(repo: str, ref: str | None) -> tuple[Path, str]: + """Clone the adapter repo and return (checkout_dir, resolved_commit_sha). + + The resolved SHA is captured so metadata can record exactly which commit + was evaluated, making runs reproducible even when ``ref`` is unset. + """ + if not ref: + logger.warning( + "Cloning adapter repo without a pinned ref; results may not be " + "reproducible. Pass --harbor-adapter-ref to pin a tag/SHA/branch." + ) checkout_dir = Path(tempfile.mkdtemp(prefix="harbor-adapter-")) cmd = ["git", "clone", "--depth", "1"] if ref: @@ -62,17 +72,26 @@ def _checkout_adapter(repo: str, ref: str | None) -> Path: ) if result.returncode != 0: raise RuntimeError(f"Failed to checkout Harbor adapter repo: {result.stderr}") - return checkout_dir + sha_result = subprocess.run( + ["git", "rev-parse", "HEAD"], cwd=checkout_dir, capture_output=True, text=True + ) + resolved_sha = ( + sha_result.stdout.strip() if sha_result.returncode == 0 else "unknown" + ) + return checkout_dir, resolved_sha -def _resolve_target(args: argparse.Namespace) -> tuple[str, str, str | None]: +def _resolve_target( + args: argparse.Namespace, +) -> tuple[str, str, str | None, str | None]: checkout_dir: Path | None = None + adapter_sha: str | None = None target = args.harbor_target target_type = args.harbor_target_type if args.harbor_adapter_repo or args.harbor_adapter_path: repo = args.harbor_adapter_repo or DEFAULT_ADAPTER_REPO - checkout_dir = _checkout_adapter(repo, args.harbor_adapter_ref) + checkout_dir, adapter_sha = _checkout_adapter(repo, args.harbor_adapter_ref) if args.harbor_adapter_path: target_path = checkout_dir / args.harbor_adapter_path if not target_path.exists(): @@ -93,7 +112,23 @@ def _resolve_target(args: argparse.Namespace) -> tuple[str, str, str | None]: else: target_type = "dataset" - return target, target_type, str(checkout_dir) if checkout_dir else None + return ( + target, + target_type, + str(checkout_dir) if checkout_dir else None, + adapter_sha, + ) + + +SECRET_KEY_PATTERNS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL", "PASSPHRASE") + + +def _is_sensitive_env_value(prev: str, part: str) -> bool: + """Return True if ``part`` is a value following ``--ae`` whose key looks secret.""" + if prev != "--ae": + return False + key = part.split("=", 1)[0].upper() + return any(pat in key for pat in SECRET_KEY_PATTERNS) def _target_args(target: str, target_type: str) -> list[str]: @@ -131,6 +166,7 @@ def run_harbor( target: str, target_type: str, checkout_dir: str | None = None, + adapter_sha: str | None = None, ) -> Path: harbor_output_dir = Path(output_dir) / "harbor_output" harbor_output_dir.mkdir(parents=True, exist_ok=True) @@ -170,7 +206,7 @@ def run_harbor( cmd.extend(shlex.split(extra_arg)) safe_cmd = [ - "***" if prev == "--ae" and part.split("=", 1)[0].endswith("KEY") else part + "***" if _is_sensitive_env_value(prev, part) else part for prev, part in zip([""] + cmd, cmd) ] logger.info("Running Harbor command: %s", " ".join(safe_cmd)) @@ -262,7 +298,7 @@ def main() -> None: ) os.makedirs(structured_output_dir, exist_ok=True) - target, target_type, checkout_dir = _resolve_target(args) + target, target_type, checkout_dir, adapter_sha = _resolve_target(args) metadata = { "llm": llm.model_dump_json(), "benchmark": args.benchmark_slug, @@ -270,6 +306,7 @@ def main() -> None: "harbor_target_type": target_type, "harbor_adapter_repo": args.harbor_adapter_repo, "harbor_adapter_ref": args.harbor_adapter_ref, + "harbor_adapter_resolved_sha": adapter_sha, "harbor_adapter_path": args.harbor_adapter_path, "harbor_adapter_checkout": checkout_dir, "harbor_agent": args.harbor_agent, @@ -287,7 +324,13 @@ def main() -> None: Path(structured_output_dir) / "harbor_output" if args.skip_harbor else run_harbor( - args, llm, structured_output_dir, target, target_type, checkout_dir + args, + llm, + structured_output_dir, + target, + target_type, + checkout_dir, + adapter_sha, ) ) convert_harbor_to_eval_output( diff --git a/tests/test_harbor_run_infer.py b/tests/test_harbor_run_infer.py index eee194c7d..dfc55b384 100644 --- a/tests/test_harbor_run_infer.py +++ b/tests/test_harbor_run_infer.py @@ -9,6 +9,7 @@ import pytest from benchmarks.harbor.run_infer import ( + _is_sensitive_env_value, _load_task_ids, _parse_key_value, _resolve_target, @@ -87,39 +88,47 @@ def test_resolve_target_requires_target() -> None: def test_resolve_target_dataset_auto() -> None: # When target doesn't exist on disk and type is auto, defaults to dataset - target, target_type, checkout = _resolve_target( + target, target_type, checkout, sha = _resolve_target( _make_args(harbor_target="my-dataset") ) assert target == "my-dataset" assert target_type == "dataset" assert checkout is None + assert sha is None def test_resolve_target_config_auto(tmp_path: Path) -> None: cfg = tmp_path / "config.yaml" cfg.write_text("foo: bar", encoding="utf-8") - target, target_type, checkout = _resolve_target(_make_args(harbor_target=str(cfg))) + target, target_type, checkout, sha = _resolve_target( + _make_args(harbor_target=str(cfg)) + ) assert target == str(cfg) assert target_type == "config" assert checkout is None + assert sha is None def test_resolve_target_path_auto(tmp_path: Path) -> None: p = tmp_path / "some_dir" p.mkdir() - target, target_type, checkout = _resolve_target(_make_args(harbor_target=str(p))) + target, target_type, checkout, sha = _resolve_target( + _make_args(harbor_target=str(p)) + ) assert target == str(p) assert target_type == "path" assert checkout is None + assert sha is None def test_resolve_target_explicit_type() -> None: - target, target_type, checkout = _resolve_target( + target, target_type, checkout, sha = _resolve_target( _make_args(harbor_target="ds", harbor_target_type="dataset") ) assert target == "ds" assert target_type == "dataset" assert checkout is None + assert sha is None def test_resolve_target_adapter_path(tmp_path: Path) -> None: @@ -131,9 +140,9 @@ def test_resolve_target_adapter_path(tmp_path: Path) -> None: with patch( "benchmarks.harbor.run_infer._checkout_adapter", - return_value=fake_checkout, + return_value=(fake_checkout, "abc123def456"), ): - target, target_type, checkout = _resolve_target( + target, target_type, checkout, sha = _resolve_target( _make_args( harbor_adapter_repo="https://example.com/repo.git", harbor_adapter_path="adapter.yaml", @@ -143,3 +152,35 @@ def test_resolve_target_adapter_path(tmp_path: Path) -> None: assert target == str(cfg_inside) assert target_type == "config" assert checkout == str(fake_checkout) + assert sha == "abc123def456" + + +# --- Secret masking tests --- + + +def test_is_sensitive_env_value_key_suffix() -> None: + assert _is_sensitive_env_value("--ae", "LLM_API_KEY=secret123") + + +def test_is_sensitive_env_value_token() -> None: + assert _is_sensitive_env_value("--ae", "GITHUB_TOKEN=ghp_xxx") + + +def test_is_sensitive_env_value_secret() -> None: + assert _is_sensitive_env_value("--ae", "HF_SECRET=hf_xxx") + + +def test_is_sensitive_env_value_password() -> None: + assert _is_sensitive_env_value("--ae", "DB_PASSWORD=p455w0rd") + + +def test_is_sensitive_env_value_not_ae() -> None: + assert not _is_sensitive_env_value("--ak", "GITHUB_TOKEN=ghp_xxx") + + +def test_is_sensitive_env_value_non_secret() -> None: + assert not _is_sensitive_env_value("--ae", "LLM_BASE_URL=https://api.example.com") + + +def test_is_sensitive_env_value_non_secret_env() -> None: + assert not _is_sensitive_env_value("--ae", "MAX_ITERATIONS=100") From c3227bc2dc43676ff2cabcae62562f12f3d66b8f Mon Sep 17 00:00:00 2001 From: Graham Neubig Date: Mon, 22 Jun 2026 18:58:54 -0400 Subject: [PATCH 8/8] fix: extend secret masking to --ak kwargs (#755) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 3rd AI review found that --agent-kwarg/--agent-kwarg-json values (--ak) were not masked, allowing GITHUB_TOKEN etc. to leak in logs. - Rename _is_sensitive_env_value → _is_sensitive_value - Check both --ae and --ak flags for sensitive key patterns - Update test that incorrectly asserted --ak was not sensitive - Add regression tests for --ak masking (27 tests total) Co-authored-by: openhands --- benchmarks/harbor/run_infer.py | 9 +++---- tests/test_harbor_run_infer.py | 43 ++++++++++++++++++++++------------ 2 files changed, 33 insertions(+), 19 deletions(-) diff --git a/benchmarks/harbor/run_infer.py b/benchmarks/harbor/run_infer.py index 5957cbf46..bef91c423 100644 --- a/benchmarks/harbor/run_infer.py +++ b/benchmarks/harbor/run_infer.py @@ -121,11 +121,12 @@ def _resolve_target( SECRET_KEY_PATTERNS = ("KEY", "TOKEN", "SECRET", "PASSWORD", "CREDENTIAL", "PASSPHRASE") +SENSITIVE_VALUE_FLAGS = ("--ae", "--ak") -def _is_sensitive_env_value(prev: str, part: str) -> bool: - """Return True if ``part`` is a value following ``--ae`` whose key looks secret.""" - if prev != "--ae": +def _is_sensitive_value(prev: str, part: str) -> bool: + """Return True if ``part`` is a value following ``--ae``/``--ak`` whose key looks secret.""" + if prev not in SENSITIVE_VALUE_FLAGS: return False key = part.split("=", 1)[0].upper() return any(pat in key for pat in SECRET_KEY_PATTERNS) @@ -206,7 +207,7 @@ def run_harbor( cmd.extend(shlex.split(extra_arg)) safe_cmd = [ - "***" if _is_sensitive_env_value(prev, part) else part + "***" if _is_sensitive_value(prev, part) else part for prev, part in zip([""] + cmd, cmd) ] logger.info("Running Harbor command: %s", " ".join(safe_cmd)) diff --git a/tests/test_harbor_run_infer.py b/tests/test_harbor_run_infer.py index dfc55b384..7d2d4167f 100644 --- a/tests/test_harbor_run_infer.py +++ b/tests/test_harbor_run_infer.py @@ -9,7 +9,7 @@ import pytest from benchmarks.harbor.run_infer import ( - _is_sensitive_env_value, + _is_sensitive_value, _load_task_ids, _parse_key_value, _resolve_target, @@ -158,29 +158,42 @@ def test_resolve_target_adapter_path(tmp_path: Path) -> None: # --- Secret masking tests --- -def test_is_sensitive_env_value_key_suffix() -> None: - assert _is_sensitive_env_value("--ae", "LLM_API_KEY=secret123") +def test_is_sensitive_value_key_suffix() -> None: + assert _is_sensitive_value("--ae", "LLM_API_KEY=secret123") -def test_is_sensitive_env_value_token() -> None: - assert _is_sensitive_env_value("--ae", "GITHUB_TOKEN=ghp_xxx") +def test_is_sensitive_value_token() -> None: + assert _is_sensitive_value("--ae", "GITHUB_TOKEN=ghp_xxx") -def test_is_sensitive_env_value_secret() -> None: - assert _is_sensitive_env_value("--ae", "HF_SECRET=hf_xxx") +def test_is_sensitive_value_secret() -> None: + assert _is_sensitive_value("--ae", "HF_SECRET=hf_xxx") -def test_is_sensitive_env_value_password() -> None: - assert _is_sensitive_env_value("--ae", "DB_PASSWORD=p455w0rd") +def test_is_sensitive_value_password() -> None: + assert _is_sensitive_value("--ae", "DB_PASSWORD=p455w0rd") -def test_is_sensitive_env_value_not_ae() -> None: - assert not _is_sensitive_env_value("--ak", "GITHUB_TOKEN=ghp_xxx") +def test_is_sensitive_value_ak_token() -> None: + """--ak values with secret-like keys must also be masked (regression test).""" + assert _is_sensitive_value("--ak", "GITHUB_TOKEN=ghp_xxx") -def test_is_sensitive_env_value_non_secret() -> None: - assert not _is_sensitive_env_value("--ae", "LLM_BASE_URL=https://api.example.com") +def test_is_sensitive_value_ak_key_suffix() -> None: + assert _is_sensitive_value("--ak", "API_KEY=secret123") -def test_is_sensitive_env_value_non_secret_env() -> None: - assert not _is_sensitive_env_value("--ae", "MAX_ITERATIONS=100") +def test_is_sensitive_value_ak_non_secret() -> None: + assert not _is_sensitive_value("--ak", "MAX_ITERATIONS=100") + + +def test_is_sensitive_value_unknown_flag() -> None: + assert not _is_sensitive_value("--foo", "GITHUB_TOKEN=ghp_xxx") + + +def test_is_sensitive_value_non_secret() -> None: + assert not _is_sensitive_value("--ae", "LLM_BASE_URL=https://api.example.com") + + +def test_is_sensitive_value_non_secret_env() -> None: + assert not _is_sensitive_value("--ae", "MAX_ITERATIONS=100")