navis-web-ad / scripts /evaluate.py
aditsg
P2
49d5e37
"""Run Navis Web benchmark modes and emit evaluation artifacts."""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Put the directory one level above this script's folder at the front of
# sys.path (unless it is already listed) before the imports that follow run.
# Presumably this is the repository root that holds `inference` and
# `navis_web_env` -- verify against the repository layout.
ROOT = Path(__file__).resolve().parents[1]
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from inference import run_benchmark_comparison
from navis_web_env.reporting import write_evaluation_artifacts
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options of the evaluation script.

    Args:
        argv: Argument strings to parse, without the program name. When
            ``None`` (the default) argparse falls back to ``sys.argv[1:]``,
            so existing zero-argument calls behave exactly as before.
            Passing an explicit list lets the parser be driven
            programmatically or from tests.

    Returns:
        A namespace with two attributes:
        ``modes`` -- list of mode names (default ``["heuristic", "oracle"]``);
        ``output_dir`` -- report directory as a string (default
        ``"outputs/evals"``).
    """
    parser = argparse.ArgumentParser(description="Evaluate Navis Web benchmark modes.")
    parser.add_argument(
        "--modes",
        nargs="+",  # one or more mode names; supplying the flag replaces the default list
        default=["heuristic", "oracle"],
        help="Agent modes to benchmark.",
    )
    parser.add_argument(
        "--output-dir",
        default="outputs/evals",
        help="Directory where report files should be written.",
    )
    return parser.parse_args(argv)
def main() -> None:
    """Run the benchmark comparison and print where each report was written.

    Reads the CLI options, passes the selected modes to
    ``run_benchmark_comparison``, hands the result together with the output
    directory to ``write_evaluation_artifacts``, and prints one
    ``[REPORT] <label>=<path>`` line for every entry of the returned mapping.
    """
    cli = parse_args()
    results = run_benchmark_comparison(cli.modes)
    report_dir = Path(cli.output_dir)
    written = write_evaluation_artifacts(results, report_dir)
    for entry in written.items():
        label, path = entry
        # Flush each line so the output shows up immediately when piped.
        print(f"[REPORT] {label}={path}", flush=True)
# Run the evaluation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()