Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions benchmarks/harbor/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Generic Harbor-backed benchmark runner."""
161 changes: 161 additions & 0 deletions benchmarks/harbor/eval_infer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
#!/usr/bin/env python3
"""Generic Harbor evaluation report generator."""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

from benchmarks.utils.laminar import LaminarService
from benchmarks.utils.report_costs import generate_cost_report
from openhands.sdk import get_logger


logger = get_logger(__name__)


def _metric(
data: dict[str, Any], test_result: dict[str, Any], key: str, default: Any
) -> Any:
metrics = data.get("metrics", {})
value = metrics.get(key)
if value is not None:
return value
return test_result.get("final_metrics", {}).get(key, default)


def process_harbor_results(input_file: str, output_file: str) -> dict[str, Any]:
completed_ids: set[str] = set()
resolved_ids: set[str] = set()
unresolved_ids: set[str] = set()
incomplete_ids: set[str] = set()
error_ids: set[str] = set()

total_cost_usd = 0.0
total_prompt_tokens = 0
total_completion_tokens = 0

with open(input_file, encoding="utf-8") as infile:
for line_num, line in enumerate(infile, 1):
line = line.strip()
if not line:
continue
try:
data = json.loads(line)
except json.JSONDecodeError as exc:
logger.error("Line %d: invalid JSON: %s", line_num, exc)
continue

instance_id = data.get("instance_id")
if not instance_id:
logger.warning("Line %d: missing instance_id", line_num)
continue
if instance_id in completed_ids or instance_id in incomplete_ids:
logger.warning(
"Line %d: duplicate instance_id %s", line_num, instance_id
)
continue

if data.get("error"):
error_ids.add(instance_id)
incomplete_ids.add(instance_id)
continue

test_result = data.get("test_result", {})
completed_ids.add(instance_id)
if test_result.get("passed") is True:
resolved_ids.add(instance_id)
else:
unresolved_ids.add(instance_id)

total_cost_usd += float(
_metric(data, test_result, "total_cost_usd", 0.0) or 0.0
)
total_prompt_tokens += int(
_metric(data, test_result, "total_prompt_tokens", 0) or 0
)
total_completion_tokens += int(
_metric(data, test_result, "total_completion_tokens", 0) or 0
)

error_path = Path(input_file).with_name(f"{Path(input_file).stem}_errors.jsonl")
if error_path.exists():
with open(error_path, encoding="utf-8") as error_file:
for line in error_file:
try:
data = json.loads(line)
except json.JSONDecodeError:
continue
instance_id = data.get("instance_id")
if instance_id and instance_id not in completed_ids:
incomplete_ids.add(instance_id)
error_ids.add(instance_id)

submitted_ids = completed_ids | incomplete_ids
report: dict[str, Any] = {
"total_instances": len(submitted_ids),
"submitted_instances": len(submitted_ids),
"completed_instances": len(completed_ids),
"incomplete_instances": len(incomplete_ids),
"resolved_instances": len(resolved_ids),
"unresolved_instances": len(unresolved_ids),
"error_instances": len(error_ids),
"submitted_ids": sorted(submitted_ids),
"completed_ids": sorted(completed_ids),
"incomplete_ids": sorted(incomplete_ids),
"resolved_ids": sorted(resolved_ids),
"unresolved_ids": sorted(unresolved_ids),
"error_ids": sorted(error_ids),
"aggregate_metrics": {
"total_cost_usd": total_cost_usd,
"total_prompt_tokens": total_prompt_tokens,
"total_completion_tokens": total_completion_tokens,
},
}

with open(output_file, "w", encoding="utf-8") as outfile:
json.dump(report, outfile, indent=4)

logger.info("Harbor report generated at %s", output_file)
logger.info(
"Resolved %d/%d completed instances", len(resolved_ids), len(completed_ids)
)
return report


def main() -> None:
parser = argparse.ArgumentParser(description="Process generic Harbor output.jsonl")
parser.add_argument("input_file", help="Path to Harbor-converted output.jsonl")
parser.add_argument("--output-file", help="Output report JSON path")
args = parser.parse_args()

input_file = Path(args.input_file)
if not input_file.exists():
logger.error("Input file does not exist: %s", input_file)
sys.exit(1)

output_file = (
Path(args.output_file)
if args.output_file
else input_file.with_suffix(".report.json")
)
try:
process_harbor_results(str(input_file), str(output_file))
generate_cost_report(str(input_file))
except Exception as exc:
logger.error("Harbor evaluation failed: %s", exc)
sys.exit(1)

try:
LaminarService.get().update_evaluation_scores(str(input_file), str(output_file))
except Exception as exc:
logger.warning("Laminar telemetry reporting failed (non-fatal): %s", exc)

print(json.dumps({"report_json": str(output_file)}))


if __name__ == "__main__":
main()
Loading
Loading