dream-s1k-demo / test_regress_full.sh
况兑
eval: greedy decode + numeric strict; system: force full decimals; regressions: A/B/C/noisy
e45d7fc
#!/usr/bin/env bash
#
# Full regression run: invokes eval_simple.py once per regression dataset
# listed in DATASETS, always with the same adapter.
#
# Usage: test_regress_full.sh [ADAPTER_PATH]
#   ADAPTER_PATH  value passed to eval_simple.py as --adapter
#                 (default: ./runs/overfit10_gold)
#
# eval_simple.py is invoked by relative path and the dataset names are passed
# through unchanged, so run this from the directory that contains them.
#
# Exit status: 0 when every evaluation exits 0; otherwise the exit status of
# the first failing evaluation (remaining datasets are not run).
# NOTE(review): this relies on eval_simple.py exiting non-zero when a
# regression is detected -- confirm against that script.
set -euo pipefail

readonly DEFAULT_ADAPTER=./runs/overfit10_gold
readonly -a DATASETS=(
  subset10.numeric.jsonl
  subset10.perturbed.chat.jsonl
  subset10.perturbed.chat.norm.jsonl
  subset10.noisy.chat.jsonl
)

main() {
  local adapter=${1:-$DEFAULT_ADAPTER}
  local dataset status

  for dataset in "${DATASETS[@]}"; do
    printf '==> %s\n' "$dataset"

    # Capture the status explicitly instead of letting `set -e` end the
    # script silently, so the failing dataset can be named on stderr.
    status=0
    python eval_simple.py --adapter "$adapter" --data "$dataset" || status=$?
    if ((status != 0)); then
      printf 'FAIL: %s (eval_simple.py exited with status %d)\n' \
        "$dataset" "$status" >&2
      exit "$status"
    fi
  done

  printf '%s\n' "OK: full regression passed."
}

main "$@"