| { | |
| "best_global_step": 900, | |
| "best_metric": 0.3843807876110077, | |
| "best_model_checkpoint": "./payment-extractor-optimized/checkpoint-900", | |
| "epoch": 4.391676866585067, | |
| "eval_steps": 100, | |
| "global_step": 900, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 2.608897936344147, | |
| "epoch": 0.04895960832313342, | |
| "grad_norm": 3.6901261806488037, | |
| "learning_rate": 3.6000000000000003e-06, | |
| "loss": 3.7818, | |
| "mean_token_accuracy": 0.4824275210499763, | |
| "num_tokens": 38801.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 2.6241641879081725, | |
| "epoch": 0.09791921664626684, | |
| "grad_norm": 3.3132846355438232, | |
| "learning_rate": 7.600000000000001e-06, | |
| "loss": 3.8589, | |
| "mean_token_accuracy": 0.4720219738781452, | |
| "num_tokens": 69778.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 2.4173066079616548, | |
| "epoch": 0.14687882496940025, | |
| "grad_norm": 3.6249752044677734, | |
| "learning_rate": 1.16e-05, | |
| "loss": 3.3912, | |
| "mean_token_accuracy": 0.5040820851922035, | |
| "num_tokens": 96530.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 2.40450359582901, | |
| "epoch": 0.19583843329253367, | |
| "grad_norm": 2.8998804092407227, | |
| "learning_rate": 1.5600000000000003e-05, | |
| "loss": 2.8447, | |
| "mean_token_accuracy": 0.5414798364043236, | |
| "num_tokens": 120256.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 2.284638965129852, | |
| "epoch": 0.24479804161566707, | |
| "grad_norm": 3.2823739051818848, | |
| "learning_rate": 1.9600000000000002e-05, | |
| "loss": 1.994, | |
| "mean_token_accuracy": 0.6505917206406593, | |
| "num_tokens": 142198.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 2.4689642548561097, | |
| "epoch": 0.2937576499388005, | |
| "grad_norm": 2.1198339462280273, | |
| "learning_rate": 1.9995795492789368e-05, | |
| "loss": 2.407, | |
| "mean_token_accuracy": 0.6436298653483391, | |
| "num_tokens": 182368.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 1.9469897836446761, | |
| "epoch": 0.3427172582619339, | |
| "grad_norm": 1.6485199928283691, | |
| "learning_rate": 1.9981265932877486e-05, | |
| "loss": 1.822, | |
| "mean_token_accuracy": 0.7381727218627929, | |
| "num_tokens": 213403.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 1.2083469659090043, | |
| "epoch": 0.39167686658506734, | |
| "grad_norm": 1.6277607679367065, | |
| "learning_rate": 1.995637449278864e-05, | |
| "loss": 1.1033, | |
| "mean_token_accuracy": 0.8396818682551384, | |
| "num_tokens": 239775.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.8737979799509048, | |
| "epoch": 0.44063647490820074, | |
| "grad_norm": 1.428305745124817, | |
| "learning_rate": 1.9921147013144782e-05, | |
| "loss": 0.819, | |
| "mean_token_accuracy": 0.8934117943048477, | |
| "num_tokens": 263407.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.5950189992785454, | |
| "epoch": 0.48959608323133413, | |
| "grad_norm": 1.842503547668457, | |
| "learning_rate": 1.98756200647502e-05, | |
| "loss": 0.5144, | |
| "mean_token_accuracy": 0.9327697649598121, | |
| "num_tokens": 285288.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.48959608323133413, | |
| "eval_entropy": 1.026704891055238, | |
| "eval_loss": 1.094792127609253, | |
| "eval_mean_token_accuracy": 0.8390047088557598, | |
| "eval_num_tokens": 285288.0, | |
| "eval_runtime": 21.2516, | |
| "eval_samples_per_second": 19.199, | |
| "eval_steps_per_second": 4.8, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 1.7036595433950423, | |
| "epoch": 0.5385556915544676, | |
| "grad_norm": 1.3429725170135498, | |
| "learning_rate": 1.9819840910626174e-05, | |
| "loss": 1.8116, | |
| "mean_token_accuracy": 0.7263100475072861, | |
| "num_tokens": 325823.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 1.3686422437429429, | |
| "epoch": 0.587515299877601, | |
| "grad_norm": 1.5084576606750488, | |
| "learning_rate": 1.9753867456945653e-05, | |
| "loss": 1.2567, | |
| "mean_token_accuracy": 0.8075757145881652, | |
| "num_tokens": 356947.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.9899184465408325, | |
| "epoch": 0.6364749082007344, | |
| "grad_norm": 1.1403902769088745, | |
| "learning_rate": 1.9677768192918973e-05, | |
| "loss": 0.9506, | |
| "mean_token_accuracy": 0.8574481099843979, | |
| "num_tokens": 384860.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.6585730135440826, | |
| "epoch": 0.6854345165238678, | |
| "grad_norm": 1.179526925086975, | |
| "learning_rate": 1.9591622119692953e-05, | |
| "loss": 0.6207, | |
| "mean_token_accuracy": 0.9152766704559326, | |
| "num_tokens": 408668.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.49890025779604913, | |
| "epoch": 0.7343941248470012, | |
| "grad_norm": 2.6344473361968994, | |
| "learning_rate": 1.9495518668337204e-05, | |
| "loss": 0.4502, | |
| "mean_token_accuracy": 0.9399106308817864, | |
| "num_tokens": 430716.0, | |
| "step": 150 | |
| }, | |
| { | |
| "entropy": 1.4263419717550279, | |
| "epoch": 0.7833537331701347, | |
| "grad_norm": 1.8926013708114624, | |
| "learning_rate": 1.9389557607002808e-05, | |
| "loss": 1.4959, | |
| "mean_token_accuracy": 0.7735099658370018, | |
| "num_tokens": 468866.0, | |
| "step": 160 | |
| }, | |
| { | |
| "entropy": 1.0953898191452027, | |
| "epoch": 0.8323133414932681, | |
| "grad_norm": 0.972065806388855, | |
| "learning_rate": 1.9273848937349712e-05, | |
| "loss": 0.9593, | |
| "mean_token_accuracy": 0.8498499140143394, | |
| "num_tokens": 499808.0, | |
| "step": 170 | |
| }, | |
| { | |
| "entropy": 0.7438366547226906, | |
| "epoch": 0.8812729498164015, | |
| "grad_norm": 1.1401448249816895, | |
| "learning_rate": 1.9148512780350384e-05, | |
| "loss": 0.7447, | |
| "mean_token_accuracy": 0.885815504193306, | |
| "num_tokens": 526616.0, | |
| "step": 180 | |
| }, | |
| { | |
| "entropy": 0.5698909372091293, | |
| "epoch": 0.9302325581395349, | |
| "grad_norm": 1.2068321704864502, | |
| "learning_rate": 1.9013679251588304e-05, | |
| "loss": 0.5473, | |
| "mean_token_accuracy": 0.9253638312220573, | |
| "num_tokens": 550407.0, | |
| "step": 190 | |
| }, | |
| { | |
| "entropy": 0.44005816280841825, | |
| "epoch": 0.9791921664626683, | |
| "grad_norm": 2.358927011489868, | |
| "learning_rate": 1.8869488326180682e-05, | |
| "loss": 0.4075, | |
| "mean_token_accuracy": 0.9496751800179482, | |
| "num_tokens": 572396.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.9791921664626683, | |
| "eval_entropy": 0.7869388329632142, | |
| "eval_loss": 0.8091182112693787, | |
| "eval_mean_token_accuracy": 0.8807467134559855, | |
| "eval_num_tokens": 572396.0, | |
| "eval_runtime": 21.2312, | |
| "eval_samples_per_second": 19.217, | |
| "eval_steps_per_second": 4.804, | |
| "step": 200 | |
| }, | |
| { | |
| "entropy": 1.2279899853306848, | |
| "epoch": 1.0244798041615668, | |
| "grad_norm": 2.1428945064544678, | |
| "learning_rate": 1.8716089693465696e-05, | |
| "loss": 1.1313, | |
| "mean_token_accuracy": 0.8170017583950145, | |
| "num_tokens": 606225.0, | |
| "step": 210 | |
| }, | |
| { | |
| "entropy": 1.1651067942380906, | |
| "epoch": 1.0734394124847002, | |
| "grad_norm": 1.0429226160049438, | |
| "learning_rate": 1.855364260160507e-05, | |
| "loss": 1.0415, | |
| "mean_token_accuracy": 0.8387726441025734, | |
| "num_tokens": 639876.0, | |
| "step": 220 | |
| }, | |
| { | |
| "entropy": 0.8667519375681877, | |
| "epoch": 1.1223990208078336, | |
| "grad_norm": 1.605554461479187, | |
| "learning_rate": 1.8382315692263324e-05, | |
| "loss": 0.8179, | |
| "mean_token_accuracy": 0.8734943136572838, | |
| "num_tokens": 669254.0, | |
| "step": 230 | |
| }, | |
| { | |
| "entropy": 0.5002034060657025, | |
| "epoch": 1.171358629130967, | |
| "grad_norm": 1.766509771347046, | |
| "learning_rate": 1.820228682553533e-05, | |
| "loss": 0.5005, | |
| "mean_token_accuracy": 0.9262110635638237, | |
| "num_tokens": 693566.0, | |
| "step": 240 | |
| }, | |
| { | |
| "entropy": 0.4824553743004799, | |
| "epoch": 1.2203182374541004, | |
| "grad_norm": 1.3010213375091553, | |
| "learning_rate": 1.8013742895303883e-05, | |
| "loss": 0.4382, | |
| "mean_token_accuracy": 0.9407104596495628, | |
| "num_tokens": 716324.0, | |
| "step": 250 | |
| }, | |
| { | |
| "entropy": 0.886764119565487, | |
| "epoch": 1.2692778457772338, | |
| "grad_norm": 2.592423915863037, | |
| "learning_rate": 1.7816879635219028e-05, | |
| "loss": 0.8545, | |
| "mean_token_accuracy": 0.8653567478060722, | |
| "num_tokens": 749252.0, | |
| "step": 260 | |
| }, | |
| { | |
| "entropy": 0.9997715935111046, | |
| "epoch": 1.3182374541003672, | |
| "grad_norm": 1.1456091403961182, | |
| "learning_rate": 1.7611901415500536e-05, | |
| "loss": 0.9247, | |
| "mean_token_accuracy": 0.8514459431171417, | |
| "num_tokens": 782122.0, | |
| "step": 270 | |
| }, | |
| { | |
| "entropy": 0.8610424906015396, | |
| "epoch": 1.3671970624235006, | |
| "grad_norm": 1.3674919605255127, | |
| "learning_rate": 1.7399021030774443e-05, | |
| "loss": 0.7975, | |
| "mean_token_accuracy": 0.8724059730768203, | |
| "num_tokens": 812066.0, | |
| "step": 280 | |
| }, | |
| { | |
| "entropy": 0.4803216062486172, | |
| "epoch": 1.416156670746634, | |
| "grad_norm": 1.0078914165496826, | |
| "learning_rate": 1.717845947916398e-05, | |
| "loss": 0.4703, | |
| "mean_token_accuracy": 0.9251162573695183, | |
| "num_tokens": 836705.0, | |
| "step": 290 | |
| }, | |
| { | |
| "entropy": 0.477826127409935, | |
| "epoch": 1.4651162790697674, | |
| "grad_norm": 1.0581692457199097, | |
| "learning_rate": 1.695044573286413e-05, | |
| "loss": 0.4365, | |
| "mean_token_accuracy": 0.9394737258553505, | |
| "num_tokens": 859600.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 1.4651162790697674, | |
| "eval_entropy": 0.7299714935760871, | |
| "eval_loss": 0.6802113056182861, | |
| "eval_mean_token_accuracy": 0.8902294659147075, | |
| "eval_num_tokens": 859600.0, | |
| "eval_runtime": 21.2433, | |
| "eval_samples_per_second": 19.206, | |
| "eval_steps_per_second": 4.802, | |
| "step": 300 | |
| }, | |
| { | |
| "entropy": 0.8674551732838154, | |
| "epoch": 1.514075887392901, | |
| "grad_norm": 2.246056079864502, | |
| "learning_rate": 1.6715216500438093e-05, | |
| "loss": 0.7837, | |
| "mean_token_accuracy": 0.874087019264698, | |
| "num_tokens": 891961.0, | |
| "step": 310 | |
| }, | |
| { | |
| "entropy": 0.9596038445830345, | |
| "epoch": 1.5630354957160342, | |
| "grad_norm": 1.619541883468628, | |
| "learning_rate": 1.647301598108234e-05, | |
| "loss": 0.8646, | |
| "mean_token_accuracy": 0.857255433499813, | |
| "num_tokens": 924600.0, | |
| "step": 320 | |
| }, | |
| { | |
| "entropy": 0.7986739322543144, | |
| "epoch": 1.6119951040391678, | |
| "grad_norm": 2.2831315994262695, | |
| "learning_rate": 1.6224095611115385e-05, | |
| "loss": 0.7373, | |
| "mean_token_accuracy": 0.8797152638435364, | |
| "num_tokens": 954438.0, | |
| "step": 330 | |
| }, | |
| { | |
| "entropy": 0.47184758856892584, | |
| "epoch": 1.660954712362301, | |
| "grad_norm": 2.6723692417144775, | |
| "learning_rate": 1.596871380295351e-05, | |
| "loss": 0.4218, | |
| "mean_token_accuracy": 0.9275540292263031, | |
| "num_tokens": 979015.0, | |
| "step": 340 | |
| }, | |
| { | |
| "entropy": 0.45431587770581244, | |
| "epoch": 1.7099143206854346, | |
| "grad_norm": 2.8102011680603027, | |
| "learning_rate": 1.570713567684432e-05, | |
| "loss": 0.3684, | |
| "mean_token_accuracy": 0.9440548986196518, | |
| "num_tokens": 1001989.0, | |
| "step": 350 | |
| }, | |
| { | |
| "entropy": 0.7829712487757206, | |
| "epoch": 1.758873929008568, | |
| "grad_norm": 1.7116206884384155, | |
| "learning_rate": 1.5439632785636707e-05, | |
| "loss": 0.663, | |
| "mean_token_accuracy": 0.8908370733261108, | |
| "num_tokens": 1034477.0, | |
| "step": 360 | |
| }, | |
| { | |
| "entropy": 0.8193635582923889, | |
| "epoch": 1.8078335373317014, | |
| "grad_norm": 0.9935292601585388, | |
| "learning_rate": 1.5166482832872923e-05, | |
| "loss": 0.7506, | |
| "mean_token_accuracy": 0.8711828157305718, | |
| "num_tokens": 1067534.0, | |
| "step": 370 | |
| }, | |
| { | |
| "entropy": 0.7036949403584003, | |
| "epoch": 1.8567931456548348, | |
| "grad_norm": 1.7203794717788696, | |
| "learning_rate": 1.4887969384495403e-05, | |
| "loss": 0.6792, | |
| "mean_token_accuracy": 0.88548723757267, | |
| "num_tokens": 1097280.0, | |
| "step": 380 | |
| }, | |
| { | |
| "entropy": 0.3680045209825039, | |
| "epoch": 1.9057527539779682, | |
| "grad_norm": 1.4336254596710205, | |
| "learning_rate": 1.4604381574467616e-05, | |
| "loss": 0.3249, | |
| "mean_token_accuracy": 0.9348379656672478, | |
| "num_tokens": 1121920.0, | |
| "step": 390 | |
| }, | |
| { | |
| "entropy": 0.37416374161839483, | |
| "epoch": 1.9547123623011016, | |
| "grad_norm": 1.0702754259109497, | |
| "learning_rate": 1.4316013804614644e-05, | |
| "loss": 0.2929, | |
| "mean_token_accuracy": 0.9463742494583129, | |
| "num_tokens": 1145002.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 1.9547123623011016, | |
| "eval_entropy": 0.6033922105151064, | |
| "eval_loss": 0.49456480145454407, | |
| "eval_mean_token_accuracy": 0.9079791880121418, | |
| "eval_num_tokens": 1145002.0, | |
| "eval_runtime": 21.1773, | |
| "eval_samples_per_second": 19.266, | |
| "eval_steps_per_second": 4.816, | |
| "step": 400 | |
| }, | |
| { | |
| "entropy": 0.41346504196927353, | |
| "epoch": 2.0, | |
| "grad_norm": 0.9998394250869751, | |
| "learning_rate": 1.4023165438994933e-05, | |
| "loss": 0.3152, | |
| "mean_token_accuracy": 0.9379849611101924, | |
| "num_tokens": 1167618.0, | |
| "step": 410 | |
| }, | |
| { | |
| "entropy": 1.0350674405694007, | |
| "epoch": 2.0489596083231336, | |
| "grad_norm": 1.5349912643432617, | |
| "learning_rate": 1.3726140493120639e-05, | |
| "loss": 0.8548, | |
| "mean_token_accuracy": 0.8497318252921104, | |
| "num_tokens": 1208219.0, | |
| "step": 420 | |
| }, | |
| { | |
| "entropy": 0.6850787699222565, | |
| "epoch": 2.097919216646267, | |
| "grad_norm": 0.9396725296974182, | |
| "learning_rate": 1.3425247318349137e-05, | |
| "loss": 0.6238, | |
| "mean_token_accuracy": 0.8850513309240341, | |
| "num_tokens": 1239429.0, | |
| "step": 430 | |
| }, | |
| { | |
| "entropy": 0.5261640004813671, | |
| "epoch": 2.1468788249694004, | |
| "grad_norm": 1.1289340257644653, | |
| "learning_rate": 1.3120798281773346e-05, | |
| "loss": 0.4823, | |
| "mean_token_accuracy": 0.9071076571941376, | |
| "num_tokens": 1266803.0, | |
| "step": 440 | |
| }, | |
| { | |
| "entropy": 0.3201193898916245, | |
| "epoch": 2.1958384332925336, | |
| "grad_norm": 0.8056601285934448, | |
| "learning_rate": 1.2813109441943166e-05, | |
| "loss": 0.2848, | |
| "mean_token_accuracy": 0.943960489332676, | |
| "num_tokens": 1290548.0, | |
| "step": 450 | |
| }, | |
| { | |
| "entropy": 0.24066586568951606, | |
| "epoch": 2.244798041615667, | |
| "grad_norm": 1.4606080055236816, | |
| "learning_rate": 1.2502500220754736e-05, | |
| "loss": 0.198, | |
| "mean_token_accuracy": 0.9593303337693214, | |
| "num_tokens": 1312508.0, | |
| "step": 460 | |
| }, | |
| { | |
| "entropy": 0.9151711657643318, | |
| "epoch": 2.2937576499388004, | |
| "grad_norm": 1.4147366285324097, | |
| "learning_rate": 1.2189293071848051e-05, | |
| "loss": 0.7861, | |
| "mean_token_accuracy": 0.8564901977777482, | |
| "num_tokens": 1351246.0, | |
| "step": 470 | |
| }, | |
| { | |
| "entropy": 0.6647965341806412, | |
| "epoch": 2.342717258261934, | |
| "grad_norm": 0.8306525349617004, | |
| "learning_rate": 1.187381314585725e-05, | |
| "loss": 0.6222, | |
| "mean_token_accuracy": 0.8851410359144211, | |
| "num_tokens": 1382196.0, | |
| "step": 480 | |
| }, | |
| { | |
| "entropy": 0.4626566760241985, | |
| "epoch": 2.391676866585067, | |
| "grad_norm": 1.2030415534973145, | |
| "learning_rate": 1.1556387952861036e-05, | |
| "loss": 0.4374, | |
| "mean_token_accuracy": 0.9146721437573433, | |
| "num_tokens": 1409050.0, | |
| "step": 490 | |
| }, | |
| { | |
| "entropy": 0.292323163151741, | |
| "epoch": 2.440636474908201, | |
| "grad_norm": 0.6598659753799438, | |
| "learning_rate": 1.1237347022383747e-05, | |
| "loss": 0.2684, | |
| "mean_token_accuracy": 0.9467211216688156, | |
| "num_tokens": 1432781.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 2.440636474908201, | |
| "eval_entropy": 0.5070152823247162, | |
| "eval_loss": 0.4397956132888794, | |
| "eval_mean_token_accuracy": 0.9159132575287539, | |
| "eval_num_tokens": 1432781.0, | |
| "eval_runtime": 21.252, | |
| "eval_samples_per_second": 19.198, | |
| "eval_steps_per_second": 4.8, | |
| "step": 500 | |
| }, | |
| { | |
| "entropy": 0.22589576356112956, | |
| "epoch": 2.489596083231334, | |
| "grad_norm": 0.9443103075027466, | |
| "learning_rate": 1.0917021561299864e-05, | |
| "loss": 0.191, | |
| "mean_token_accuracy": 0.9594075292348861, | |
| "num_tokens": 1454735.0, | |
| "step": 510 | |
| }, | |
| { | |
| "entropy": 0.8820951759815217, | |
| "epoch": 2.5385556915544676, | |
| "grad_norm": 1.3829594850540161, | |
| "learning_rate": 1.0595744109997326e-05, | |
| "loss": 0.7404, | |
| "mean_token_accuracy": 0.8678841248154641, | |
| "num_tokens": 1494090.0, | |
| "step": 520 | |
| }, | |
| { | |
| "entropy": 0.6331574842333794, | |
| "epoch": 2.5875152998776008, | |
| "grad_norm": 1.056781530380249, | |
| "learning_rate": 1.0273848197156401e-05, | |
| "loss": 0.5978, | |
| "mean_token_accuracy": 0.888825386762619, | |
| "num_tokens": 1525062.0, | |
| "step": 530 | |
| }, | |
| { | |
| "entropy": 0.4361429732292891, | |
| "epoch": 2.6364749082007344, | |
| "grad_norm": 1.1021668910980225, | |
| "learning_rate": 9.951667993502599e-06, | |
| "loss": 0.3999, | |
| "mean_token_accuracy": 0.9207724243402481, | |
| "num_tokens": 1551813.0, | |
| "step": 540 | |
| }, | |
| { | |
| "entropy": 0.2877490069717169, | |
| "epoch": 2.685434516523868, | |
| "grad_norm": 0.7195802927017212, | |
| "learning_rate": 9.629537964893063e-06, | |
| "loss": 0.2665, | |
| "mean_token_accuracy": 0.9467098742723465, | |
| "num_tokens": 1575433.0, | |
| "step": 550 | |
| }, | |
| { | |
| "entropy": 0.2189638450741768, | |
| "epoch": 2.734394124847001, | |
| "grad_norm": 1.2461501359939575, | |
| "learning_rate": 9.307792525096582e-06, | |
| "loss": 0.1899, | |
| "mean_token_accuracy": 0.9602582737803459, | |
| "num_tokens": 1597306.0, | |
| "step": 560 | |
| }, | |
| { | |
| "entropy": 0.825089768320322, | |
| "epoch": 2.783353733170135, | |
| "grad_norm": 1.1732733249664307, | |
| "learning_rate": 8.986765688627652e-06, | |
| "loss": 0.6871, | |
| "mean_token_accuracy": 0.8730710700154305, | |
| "num_tokens": 1636454.0, | |
| "step": 570 | |
| }, | |
| { | |
| "entropy": 0.6250899910926819, | |
| "epoch": 2.832313341493268, | |
| "grad_norm": 0.8510639071464539, | |
| "learning_rate": 8.666790723995043e-06, | |
| "loss": 0.597, | |
| "mean_token_accuracy": 0.8902681171894073, | |
| "num_tokens": 1667449.0, | |
| "step": 580 | |
| }, | |
| { | |
| "entropy": 0.4255227465182543, | |
| "epoch": 2.8812729498164016, | |
| "grad_norm": 0.7777973413467407, | |
| "learning_rate": 8.348199807724806e-06, | |
| "loss": 0.3826, | |
| "mean_token_accuracy": 0.921813291311264, | |
| "num_tokens": 1694019.0, | |
| "step": 590 | |
| }, | |
| { | |
| "entropy": 0.280773538723588, | |
| "epoch": 2.9302325581395348, | |
| "grad_norm": 0.7043775916099548, | |
| "learning_rate": 8.0313236795169e-06, | |
| "loss": 0.2658, | |
| "mean_token_accuracy": 0.9461780115962029, | |
| "num_tokens": 1717763.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 2.9302325581395348, | |
| "eval_entropy": 0.4671570363582349, | |
| "eval_loss": 0.41600170731544495, | |
| "eval_mean_token_accuracy": 0.919863361938327, | |
| "eval_num_tokens": 1717763.0, | |
| "eval_runtime": 21.0643, | |
| "eval_samples_per_second": 19.369, | |
| "eval_steps_per_second": 4.842, | |
| "step": 600 | |
| }, | |
| { | |
| "entropy": 0.22502716928720473, | |
| "epoch": 2.9791921664626684, | |
| "grad_norm": 1.245797872543335, | |
| "learning_rate": 7.716491298893443e-06, | |
| "loss": 0.2014, | |
| "mean_token_accuracy": 0.9587241023778915, | |
| "num_tokens": 1739700.0, | |
| "step": 610 | |
| }, | |
| { | |
| "entropy": 0.7121120212045876, | |
| "epoch": 3.0244798041615666, | |
| "grad_norm": 1.578238606452942, | |
| "learning_rate": 7.404029503695028e-06, | |
| "loss": 0.592, | |
| "mean_token_accuracy": 0.884317248254209, | |
| "num_tokens": 1773279.0, | |
| "step": 620 | |
| }, | |
| { | |
| "entropy": 0.6201752804219722, | |
| "epoch": 3.0734394124847, | |
| "grad_norm": 0.9049448370933533, | |
| "learning_rate": 7.094262670779611e-06, | |
| "loss": 0.5563, | |
| "mean_token_accuracy": 0.8980106115341187, | |
| "num_tokens": 1806614.0, | |
| "step": 630 | |
| }, | |
| { | |
| "entropy": 0.5976772040128708, | |
| "epoch": 3.1223990208078334, | |
| "grad_norm": 1.2108434438705444, | |
| "learning_rate": 6.78751237927623e-06, | |
| "loss": 0.5527, | |
| "mean_token_accuracy": 0.898520989716053, | |
| "num_tokens": 1836366.0, | |
| "step": 640 | |
| }, | |
| { | |
| "entropy": 0.26630178317427633, | |
| "epoch": 3.171358629130967, | |
| "grad_norm": 0.9259095788002014, | |
| "learning_rate": 6.48409707674317e-06, | |
| "loss": 0.2551, | |
| "mean_token_accuracy": 0.9447213500738144, | |
| "num_tokens": 1860872.0, | |
| "step": 650 | |
| }, | |
| { | |
| "entropy": 0.2778078857809305, | |
| "epoch": 3.2203182374541, | |
| "grad_norm": 1.1952687501907349, | |
| "learning_rate": 6.18433174857705e-06, | |
| "loss": 0.2529, | |
| "mean_token_accuracy": 0.9497045069932938, | |
| "num_tokens": 1883874.0, | |
| "step": 660 | |
| }, | |
| { | |
| "entropy": 0.5209752138704061, | |
| "epoch": 3.269277845777234, | |
| "grad_norm": 1.9098657369613647, | |
| "learning_rate": 5.8885275910161574e-06, | |
| "loss": 0.4378, | |
| "mean_token_accuracy": 0.9142355337738991, | |
| "num_tokens": 1917087.0, | |
| "step": 670 | |
| }, | |
| { | |
| "entropy": 0.64239452034235, | |
| "epoch": 3.318237454100367, | |
| "grad_norm": 0.8732991814613342, | |
| "learning_rate": 5.596991688077409e-06, | |
| "loss": 0.5759, | |
| "mean_token_accuracy": 0.8934120118618012, | |
| "num_tokens": 1949777.0, | |
| "step": 680 | |
| }, | |
| { | |
| "entropy": 0.5809963166713714, | |
| "epoch": 3.3671970624235006, | |
| "grad_norm": 0.8436909317970276, | |
| "learning_rate": 5.310026692762316e-06, | |
| "loss": 0.5335, | |
| "mean_token_accuracy": 0.8995566830039025, | |
| "num_tokens": 1978937.0, | |
| "step": 690 | |
| }, | |
| { | |
| "entropy": 0.2611778501421213, | |
| "epoch": 3.416156670746634, | |
| "grad_norm": 1.1178691387176514, | |
| "learning_rate": 5.027930512862976e-06, | |
| "loss": 0.2472, | |
| "mean_token_accuracy": 0.9472135573625564, | |
| "num_tokens": 2003325.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 3.416156670746634, | |
| "eval_entropy": 0.4348171895333365, | |
| "eval_loss": 0.39616772532463074, | |
| "eval_mean_token_accuracy": 0.9233323300586027, | |
| "eval_num_tokens": 2003325.0, | |
| "eval_runtime": 21.154, | |
| "eval_samples_per_second": 19.287, | |
| "eval_steps_per_second": 4.822, | |
| "step": 700 | |
| }, | |
| { | |
| "entropy": 0.25852978341281413, | |
| "epoch": 3.4651162790697674, | |
| "grad_norm": 1.280714511871338, | |
| "learning_rate": 4.750996001694215e-06, | |
| "loss": 0.2374, | |
| "mean_token_accuracy": 0.9529847070574761, | |
| "num_tokens": 2026113.0, | |
| "step": 710 | |
| }, | |
| { | |
| "entropy": 0.5000423431396485, | |
| "epoch": 3.514075887392901, | |
| "grad_norm": 1.6330279111862183, | |
| "learning_rate": 4.479510654072909e-06, | |
| "loss": 0.4167, | |
| "mean_token_accuracy": 0.9191933527588845, | |
| "num_tokens": 2058740.0, | |
| "step": 720 | |
| }, | |
| { | |
| "entropy": 0.5997287526726722, | |
| "epoch": 3.563035495716034, | |
| "grad_norm": 0.8560138940811157, | |
| "learning_rate": 4.213756307860175e-06, | |
| "loss": 0.5358, | |
| "mean_token_accuracy": 0.8984866350889206, | |
| "num_tokens": 2091634.0, | |
| "step": 730 | |
| }, | |
| { | |
| "entropy": 0.5770318634808064, | |
| "epoch": 3.611995104039168, | |
| "grad_norm": 1.2353603839874268, | |
| "learning_rate": 3.954008851376252e-06, | |
| "loss": 0.5377, | |
| "mean_token_accuracy": 0.9002507776021957, | |
| "num_tokens": 2121380.0, | |
| "step": 740 | |
| }, | |
| { | |
| "entropy": 0.26896645687520504, | |
| "epoch": 3.660954712362301, | |
| "grad_norm": 0.5919920802116394, | |
| "learning_rate": 3.700537936991733e-06, | |
| "loss": 0.2541, | |
| "mean_token_accuracy": 0.9460319548845291, | |
| "num_tokens": 2145937.0, | |
| "step": 750 | |
| }, | |
| { | |
| "entropy": 0.27403030432760717, | |
| "epoch": 3.7099143206854346, | |
| "grad_norm": 0.9070358276367188, | |
| "learning_rate": 3.4536067011925945e-06, | |
| "loss": 0.2555, | |
| "mean_token_accuracy": 0.9493874981999397, | |
| "num_tokens": 2168886.0, | |
| "step": 760 | |
| }, | |
| { | |
| "entropy": 0.49704274982213975, | |
| "epoch": 3.758873929008568, | |
| "grad_norm": 1.7363125085830688, | |
| "learning_rate": 3.213471491409568e-06, | |
| "loss": 0.42, | |
| "mean_token_accuracy": 0.9207542642951012, | |
| "num_tokens": 2201618.0, | |
| "step": 770 | |
| }, | |
| { | |
| "entropy": 0.6181983411312103, | |
| "epoch": 3.8078335373317014, | |
| "grad_norm": 0.8889790773391724, | |
| "learning_rate": 2.9803815998954334e-06, | |
| "loss": 0.5448, | |
| "mean_token_accuracy": 0.8981723725795746, | |
| "num_tokens": 2234716.0, | |
| "step": 780 | |
| }, | |
| { | |
| "entropy": 0.5966475777328014, | |
| "epoch": 3.8567931456548346, | |
| "grad_norm": 1.2056671380996704, | |
| "learning_rate": 2.7545790049265506e-06, | |
| "loss": 0.5489, | |
| "mean_token_accuracy": 0.8978816866874695, | |
| "num_tokens": 2264636.0, | |
| "step": 790 | |
| }, | |
| { | |
| "entropy": 0.2779835805296898, | |
| "epoch": 3.905752753977968, | |
| "grad_norm": 0.9485862851142883, | |
| "learning_rate": 2.5362981195972627e-06, | |
| "loss": 0.2623, | |
| "mean_token_accuracy": 0.9445345297455787, | |
| "num_tokens": 2289186.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 3.905752753977968, | |
| "eval_entropy": 0.42678100907919453, | |
| "eval_loss": 0.3873916268348694, | |
| "eval_mean_token_accuracy": 0.9257115113968942, | |
| "eval_num_tokens": 2289186.0, | |
| "eval_runtime": 21.0498, | |
| "eval_samples_per_second": 19.383, | |
| "eval_steps_per_second": 4.846, | |
| "step": 800 | |
| }, | |
| { | |
| "entropy": 0.26332913562655447, | |
| "epoch": 3.954712362301102, | |
| "grad_norm": 0.8345707058906555, | |
| "learning_rate": 2.3257655484679376e-06, | |
| "loss": 0.2408, | |
| "mean_token_accuracy": 0.9522591263055802, | |
| "num_tokens": 2312074.0, | |
| "step": 810 | |
| }, | |
| { | |
| "entropy": 0.31254331364824967, | |
| "epoch": 4.0, | |
| "grad_norm": 1.0298808813095093, | |
| "learning_rate": 2.123199852319352e-06, | |
| "loss": 0.2697, | |
| "mean_token_accuracy": 0.9450424829044858, | |
| "num_tokens": 2335236.0, | |
| "step": 820 | |
| }, | |
| { | |
| "entropy": 0.6749883458018303, | |
| "epoch": 4.048959608323133, | |
| "grad_norm": 0.9510061740875244, | |
| "learning_rate": 1.9288113212575454e-06, | |
| "loss": 0.5564, | |
| "mean_token_accuracy": 0.8976946637034416, | |
| "num_tokens": 2374400.0, | |
| "step": 830 | |
| }, | |
| { | |
| "entropy": 0.5979305505752563, | |
| "epoch": 4.097919216646267, | |
| "grad_norm": 0.8506686687469482, | |
| "learning_rate": 1.7428017564047594e-06, | |
| "loss": 0.562, | |
| "mean_token_accuracy": 0.8973473310470581, | |
| "num_tokens": 2405288.0, | |
| "step": 840 | |
| }, | |
| { | |
| "entropy": 0.38541266694664955, | |
| "epoch": 4.1468788249694, | |
| "grad_norm": 0.8283617496490479, | |
| "learning_rate": 1.565364260403055e-06, | |
| "loss": 0.3559, | |
| "mean_token_accuracy": 0.9307585805654526, | |
| "num_tokens": 2431701.0, | |
| "step": 850 | |
| }, | |
| { | |
| "entropy": 0.26541123948991296, | |
| "epoch": 4.195838433292534, | |
| "grad_norm": 0.6910677552223206, | |
| "learning_rate": 1.3966830369481231e-06, | |
| "loss": 0.2495, | |
| "mean_token_accuracy": 0.9491465017199516, | |
| "num_tokens": 2455249.0, | |
| "step": 860 | |
| }, | |
| { | |
| "entropy": 0.19118896164000035, | |
| "epoch": 4.244798041615667, | |
| "grad_norm": 0.9370400905609131, | |
| "learning_rate": 1.2369331995613664e-06, | |
| "loss": 0.1709, | |
| "mean_token_accuracy": 0.9622136607766152, | |
| "num_tokens": 2477038.0, | |
| "step": 870 | |
| }, | |
| { | |
| "entropy": 0.7027731344103814, | |
| "epoch": 4.293757649938801, | |
| "grad_norm": 1.2064703702926636, | |
| "learning_rate": 1.0862805897987894e-06, | |
| "loss": 0.5842, | |
| "mean_token_accuracy": 0.8925257995724678, | |
| "num_tokens": 2516496.0, | |
| "step": 880 | |
| }, | |
| { | |
| "entropy": 0.5964450292289257, | |
| "epoch": 4.342717258261934, | |
| "grad_norm": 1.0601837635040283, | |
| "learning_rate": 9.448816050854559e-07, | |
| "loss": 0.5577, | |
| "mean_token_accuracy": 0.8974330082535744, | |
| "num_tokens": 2547663.0, | |
| "step": 890 | |
| }, | |
| { | |
| "entropy": 0.4454402156174183, | |
| "epoch": 4.391676866585067, | |
| "grad_norm": 0.7408022880554199, | |
| "learning_rate": 8.128830363541574e-07, | |
| "loss": 0.4126, | |
| "mean_token_accuracy": 0.9200813502073288, | |
| "num_tokens": 2575037.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 4.391676866585067, | |
| "eval_entropy": 0.4241247632924248, | |
| "eval_loss": 0.3843807876110077, | |
| "eval_mean_token_accuracy": 0.9258733800813264, | |
| "eval_num_tokens": 2575037.0, | |
| "eval_runtime": 21.0482, | |
| "eval_samples_per_second": 19.384, | |
| "eval_steps_per_second": 4.846, | |
| "step": 900 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 1025, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 5, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.525760757970944e+16, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |