# BrowserForge / sft_data.py
# cryptodarth's picture
# V1
# 42d1599
"""Build SFT JSONL data from replay trajectories."""
from __future__ import annotations
import argparse
import json
try:
from .slm_policy import replay_to_sft_rows, write_sft_jsonl
except ImportError: # pragma: no cover
from slm_policy import replay_to_sft_rows, write_sft_jsonl
def main(argv: list[str] | None = None) -> None:
    """Export replay trajectories as SFT rows and print a JSON summary.

    Parses the command line, converts the given replay files with
    ``replay_to_sft_rows``, writes the rows with ``write_sft_jsonl`` and
    prints a JSON object containing the output path and the row count.

    Args:
        argv: Argument strings to parse instead of the process command
            line. ``None`` (the default) makes argparse fall back to
            ``sys.argv[1:]``, so existing zero-argument callers and the
            script entry point behave exactly as before.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--replay",
        nargs="+",
        default=["artifacts/trajectories.jsonl"],
        help="One or more replay trajectory files to convert.",
    )
    parser.add_argument(
        "--output",
        default="artifacts/sft/browser_actions.jsonl",
        help="Path the SFT rows are written to.",
    )
    parser.add_argument(
        "--include-failures",
        action="store_true",
        help="Include failed episodes. By default only successful episodes are exported.",
    )
    parser.add_argument(
        "--include-noop",
        action="store_true",
        help="Include noop actions. By default noop/recovery actions are excluded from SFT.",
    )
    args = parser.parse_args(argv)

    # "ask_oracle" is always filtered out; "noop" is filtered out unless the
    # caller explicitly opts in with --include-noop.
    excluded = ("ask_oracle",) if args.include_noop else ("ask_oracle", "noop")
    rows = replay_to_sft_rows(
        args.replay,
        success_only=not args.include_failures,
        excluded_action_types=excluded,
    )
    write_sft_jsonl(rows, args.output)

    # Summary on stdout for whoever invoked the export.
    print(json.dumps({"output": args.output, "rows": len(rows)}, indent=2))
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()