{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007002801120448179, "grad_norm": 101.72705078125, "learning_rate": 0.0, "logits/chosen": -4.4921875, "logits/rejected": -4.50390625, "logps/chosen": -367.25, "logps/rejected": -333.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0035014005602240898, "grad_norm": 33.1419792175293, "learning_rate": 2.797202797202797e-08, "logits/chosen": -4.5009765625, "logits/rejected": -4.5283203125, "logps/chosen": -337.3125, "logps/rejected": -297.53125, "loss": 0.6915, "rewards/accuracies": 0.109375, "rewards/chosen": -0.0002932548522949219, "rewards/margins": 0.0015611648559570312, "rewards/rejected": -0.0018558502197265625, "step": 5 }, { "epoch": 0.0070028011204481795, "grad_norm": 76.48152923583984, "learning_rate": 6.293706293706294e-08, "logits/chosen": -4.491406440734863, "logits/rejected": -4.510937690734863, "logps/chosen": -303.79998779296875, "logps/rejected": -276.4125061035156, "loss": 0.692, "rewards/accuracies": 0.16875000298023224, "rewards/chosen": -0.01181716937571764, "rewards/margins": 0.002262115478515625, "rewards/rejected": -0.01406936626881361, "step": 10 }, { "epoch": 0.01050420168067227, "grad_norm": 90.04021453857422, "learning_rate": 9.79020979020979e-08, "logits/chosen": -4.504687309265137, "logits/rejected": -4.514843940734863, "logps/chosen": -339.0249938964844, "logps/rejected": -315.04998779296875, "loss": 0.6924, "rewards/accuracies": 0.125, "rewards/chosen": -0.009069060906767845, "rewards/margins": 0.00015182494826149195, "rewards/rejected": -0.00923309288918972, "step": 15 }, { "epoch": 0.014005602240896359, "grad_norm": 72.611328125, "learning_rate": 1.3286713286713285e-07, "logits/chosen": -4.508593559265137, "logits/rejected": -4.518750190734863, "logps/chosen": -296.70001220703125, "logps/rejected": -268.76251220703125, "loss": 0.6978, "rewards/accuracies": 0.10000000149011612, "rewards/chosen": -0.014070892706513405, "rewards/margins": -0.010562133975327015, "rewards/rejected": -0.00351715087890625, "step": 20 }, { "epoch": 0.01750700280112045, "grad_norm": 70.4524154663086, "learning_rate": 1.6783216783216782e-07, "logits/chosen": -4.490624904632568, "logits/rejected": -4.521874904632568, "logps/chosen": -312.20001220703125, "logps/rejected": -287.0, "loss": 0.6873, "rewards/accuracies": 0.20624999701976776, "rewards/chosen": -0.008679961785674095, "rewards/margins": 0.011355971917510033, "rewards/rejected": -0.020025253295898438, "step": 25 }, { "epoch": 0.02100840336134454, "grad_norm": 181.83139038085938, "learning_rate": 2.0279720279720277e-07, "logits/chosen": -4.515625, "logits/rejected": -4.525000095367432, "logps/chosen": -310.3500061035156, "logps/rejected": -285.75, "loss": 0.6951, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": -0.05675353854894638, "rewards/margins": -0.0030269622802734375, "rewards/rejected": -0.05369110032916069, "step": 30 }, { "epoch": 0.024509803921568627, "grad_norm": 40.653568267822266, "learning_rate": 2.3776223776223774e-07, "logits/chosen": -4.502343654632568, "logits/rejected": -4.51953125, "logps/chosen": -326.46875, "logps/rejected": -295.5625, "loss": 0.6799, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.1183624267578125, "rewards/margins": 0.030280303210020065, "rewards/rejected": -0.14861373603343964, "step": 35 }, { "epoch": 0.028011204481792718, "grad_norm": 46.50446319580078, "learning_rate": 2.727272727272727e-07, "logits/chosen": -4.517968654632568, "logits/rejected": -4.546875, "logps/chosen": -309.45001220703125, "logps/rejected": -290.3125, "loss": 0.6777, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.12889710068702698, "rewards/margins": 0.03403778001666069, "rewards/rejected": -0.16286087036132812, "step": 40 }, { "epoch": 0.031512605042016806, "grad_norm": 14.87103271484375, "learning_rate": 3.076923076923077e-07, "logits/chosen": -4.514062404632568, "logits/rejected": -4.534375190734863, "logps/chosen": -309.04998779296875, "logps/rejected": -279.04998779296875, "loss": 0.6592, "rewards/accuracies": 0.53125, "rewards/chosen": -0.171966552734375, "rewards/margins": 0.07354736328125, "rewards/rejected": -0.24545899033546448, "step": 45 }, { "epoch": 0.0350140056022409, "grad_norm": 47.90538787841797, "learning_rate": 3.4265734265734264e-07, "logits/chosen": -4.51953125, "logits/rejected": -4.529687404632568, "logps/chosen": -346.1499938964844, "logps/rejected": -314.5249938964844, "loss": 0.6184, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2255096435546875, "rewards/margins": 0.16802291572093964, "rewards/rejected": -0.3935233950614929, "step": 50 }, { "epoch": 0.03851540616246499, "grad_norm": 57.09840393066406, "learning_rate": 3.776223776223776e-07, "logits/chosen": -4.512499809265137, "logits/rejected": -4.538281440734863, "logps/chosen": -336.625, "logps/rejected": -307.9375, "loss": 0.6265, "rewards/accuracies": 0.625, "rewards/chosen": -0.2501220703125, "rewards/margins": 0.1536407470703125, "rewards/rejected": -0.40345460176467896, "step": 55 }, { "epoch": 0.04201680672268908, "grad_norm": 54.38421630859375, "learning_rate": 4.125874125874126e-07, "logits/chosen": -4.521093845367432, "logits/rejected": -4.532812595367432, "logps/chosen": -333.17498779296875, "logps/rejected": -307.3374938964844, "loss": 0.5915, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.29718321561813354, "rewards/margins": 0.2501785159111023, "rewards/rejected": -0.5469970703125, "step": 60 }, { "epoch": 0.04551820728291316, "grad_norm": 46.930301666259766, "learning_rate": 4.4755244755244753e-07, "logits/chosen": -4.560937404632568, "logits/rejected": -4.564843654632568, "logps/chosen": -313.95001220703125, "logps/rejected": -288.82501220703125, "loss": 0.5812, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.456216424703598, "rewards/margins": 0.2818649411201477, "rewards/rejected": -0.738232433795929, "step": 65 }, { "epoch": 0.049019607843137254, "grad_norm": 16.46210289001465, "learning_rate": 4.825174825174824e-07, "logits/chosen": -4.526562690734863, "logits/rejected": -4.547656059265137, "logps/chosen": -353.45001220703125, "logps/rejected": -322.0249938964844, "loss": 0.5431, "rewards/accuracies": 0.75, "rewards/chosen": -0.6093078851699829, "rewards/margins": 0.3897949159145355, "rewards/rejected": -0.998486340045929, "step": 70 }, { "epoch": 0.052521008403361345, "grad_norm": 83.04356384277344, "learning_rate": 5.174825174825175e-07, "logits/chosen": -4.528124809265137, "logits/rejected": -4.564843654632568, "logps/chosen": -292.9937438964844, "logps/rejected": -269.10626220703125, "loss": 0.551, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.2913818359375, "rewards/margins": 0.39362794160842896, "rewards/rejected": -0.6848999261856079, "step": 75 }, { "epoch": 0.056022408963585436, "grad_norm": 123.90628051757812, "learning_rate": 5.524475524475523e-07, "logits/chosen": -4.520312309265137, "logits/rejected": -4.55078125, "logps/chosen": -322.54998779296875, "logps/rejected": -302.98748779296875, "loss": 0.4953, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.559588611125946, "rewards/margins": 0.5720565915107727, "rewards/rejected": -1.1309814453125, "step": 80 }, { "epoch": 0.05952380952380952, "grad_norm": 26.677682876586914, "learning_rate": 5.874125874125873e-07, "logits/chosen": -4.529687404632568, "logits/rejected": -4.536718845367432, "logps/chosen": -340.875, "logps/rejected": -318.42498779296875, "loss": 0.4645, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.4416137635707855, "rewards/margins": 0.73309326171875, "rewards/rejected": -1.17474365234375, "step": 85 }, { "epoch": 0.06302521008403361, "grad_norm": 34.868038177490234, "learning_rate": 6.223776223776223e-07, "logits/chosen": -4.514062404632568, "logits/rejected": -4.541406154632568, "logps/chosen": -324.17498779296875, "logps/rejected": -306.07501220703125, "loss": 0.4807, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.4429565370082855, "rewards/margins": 0.7114837765693665, "rewards/rejected": -1.1531493663787842, "step": 90 }, { "epoch": 0.0665266106442577, "grad_norm": 111.43302917480469, "learning_rate": 6.573426573426572e-07, "logits/chosen": -4.5234375, "logits/rejected": -4.536718845367432, "logps/chosen": -304.7749938964844, "logps/rejected": -286.75, "loss": 0.4656, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.11838226020336151, "rewards/margins": 0.7753235101699829, "rewards/rejected": -0.893786609172821, "step": 95 }, { "epoch": 0.0700280112044818, "grad_norm": 7.1454339027404785, "learning_rate": 6.923076923076922e-07, "logits/chosen": -4.491406440734863, "logits/rejected": -4.510937690734863, "logps/chosen": -318.8500061035156, "logps/rejected": -301.1000061035156, "loss": 0.4015, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.0882415771484375, "rewards/margins": 1.081781029701233, "rewards/rejected": -1.1699950695037842, "step": 100 }, { "epoch": 0.07352941176470588, "grad_norm": 56.80259323120117, "learning_rate": 7.272727272727272e-07, "logits/chosen": -4.460156440734863, "logits/rejected": -4.479687690734863, "logps/chosen": -303.9375, "logps/rejected": -283.2124938964844, "loss": 0.445, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.15115967392921448, "rewards/margins": 1.0308959484100342, "rewards/rejected": -0.881054699420929, "step": 105 }, { "epoch": 0.07703081232492998, "grad_norm": 210.96456909179688, "learning_rate": 7.622377622377621e-07, "logits/chosen": -4.453906059265137, "logits/rejected": -4.459374904632568, "logps/chosen": -301.26251220703125, "logps/rejected": -282.79376220703125, "loss": 0.4522, "rewards/accuracies": 0.84375, "rewards/chosen": 0.21079406142234802, "rewards/margins": 1.071386694908142, "rewards/rejected": -0.86181640625, "step": 110 }, { "epoch": 0.08053221288515407, "grad_norm": 11.06592845916748, "learning_rate": 7.972027972027971e-07, "logits/chosen": -4.440625190734863, "logits/rejected": -4.447656154632568, "logps/chosen": -330.0249938964844, "logps/rejected": -319.625, "loss": 0.3711, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.9296020269393921, "rewards/margins": 1.491845726966858, "rewards/rejected": -0.562487781047821, "step": 115 }, { "epoch": 0.08403361344537816, "grad_norm": 105.50016021728516, "learning_rate": 8.321678321678321e-07, "logits/chosen": -4.4296875, "logits/rejected": -4.431250095367432, "logps/chosen": -315.7875061035156, "logps/rejected": -306.2562561035156, "loss": 0.3411, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.582104504108429, "rewards/margins": 1.7448608875274658, "rewards/rejected": -1.1632782220840454, "step": 120 }, { "epoch": 0.08753501400560224, "grad_norm": 168.21710205078125, "learning_rate": 8.67132867132867e-07, "logits/chosen": -4.440625190734863, "logits/rejected": -4.448437690734863, "logps/chosen": -295.2749938964844, "logps/rejected": -295.04998779296875, "loss": 0.406, "rewards/accuracies": 0.875, "rewards/chosen": 0.5245513916015625, "rewards/margins": 1.5556640625, "rewards/rejected": -1.031103491783142, "step": 125 }, { "epoch": 0.09103641456582633, "grad_norm": 10.8776273727417, "learning_rate": 9.02097902097902e-07, "logits/chosen": -4.44140625, "logits/rejected": -4.44140625, "logps/chosen": -325.4624938964844, "logps/rejected": -317.51251220703125, "loss": 0.4175, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.46117860078811646, "rewards/margins": 1.69061279296875, "rewards/rejected": -2.151049852371216, "step": 130 }, { "epoch": 0.09453781512605042, "grad_norm": 7.193403720855713, "learning_rate": 9.37062937062937e-07, "logits/chosen": -4.432031154632568, "logits/rejected": -4.435937404632568, "logps/chosen": -355.67498779296875, "logps/rejected": -344.32501220703125, "loss": 0.306, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.3448729515075684, "rewards/margins": 1.7757079601287842, "rewards/rejected": -4.119833469390869, "step": 135 }, { "epoch": 0.09803921568627451, "grad_norm": 9.44430160522461, "learning_rate": 9.72027972027972e-07, "logits/chosen": -4.416406154632568, "logits/rejected": -4.400781154632568, "logps/chosen": -320.125, "logps/rejected": -314.9750061035156, "loss": 0.3724, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6501343250274658, "rewards/margins": 1.635766625404358, "rewards/rejected": -3.285888671875, "step": 140 }, { "epoch": 0.1015406162464986, "grad_norm": 104.80125427246094, "learning_rate": 9.999985057155316e-07, "logits/chosen": -4.379687309265137, "logits/rejected": -4.364843845367432, "logps/chosen": -315.95001220703125, "logps/rejected": -321.0249938964844, "loss": 0.2703, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.275909423828125, "rewards/margins": 2.2874755859375, "rewards/rejected": -2.5631346702575684, "step": 145 }, { "epoch": 0.10504201680672269, "grad_norm": 8.149246215820312, "learning_rate": 9.999462066969451e-07, "logits/chosen": -4.30859375, "logits/rejected": -4.300000190734863, "logps/chosen": -317.2875061035156, "logps/rejected": -309.7250061035156, "loss": 0.2573, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.4537109434604645, "rewards/margins": 2.4706053733825684, "rewards/rejected": -2.9227538108825684, "step": 150 }, { "epoch": 0.10854341736694678, "grad_norm": 10.2169828414917, "learning_rate": 9.998192023862448e-07, "logits/chosen": -4.253125190734863, "logits/rejected": -4.235156059265137, "logps/chosen": -298.79998779296875, "logps/rejected": -311.4375, "loss": 0.3356, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.627392590045929, "rewards/margins": 2.5416016578674316, "rewards/rejected": -1.9140136241912842, "step": 155 }, { "epoch": 0.11204481792717087, "grad_norm": 6.131630897521973, "learning_rate": 9.996175117612607e-07, "logits/chosen": -4.217187404632568, "logits/rejected": -4.154687404632568, "logps/chosen": -319.70001220703125, "logps/rejected": -337.875, "loss": 0.2292, "rewards/accuracies": 0.90625, "rewards/chosen": 0.4034667909145355, "rewards/margins": 3.26318359375, "rewards/rejected": -2.8581299781799316, "step": 160 }, { "epoch": 0.11554621848739496, "grad_norm": 26.314456939697266, "learning_rate": 9.993411649599492e-07, "logits/chosen": -4.126562595367432, "logits/rejected": -4.089062690734863, "logps/chosen": -338.70001220703125, "logps/rejected": -337.17498779296875, "loss": 0.3577, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.941699206829071, "rewards/margins": 2.522167921066284, "rewards/rejected": -3.4608397483825684, "step": 165 }, { "epoch": 0.11904761904761904, "grad_norm": 297.8920593261719, "learning_rate": 9.989902032758903e-07, "logits/chosen": -4.151953220367432, "logits/rejected": -4.126172065734863, "logps/chosen": -325.40625, "logps/rejected": -334.625, "loss": 0.2709, "rewards/accuracies": 0.90625, "rewards/chosen": -1.824884057044983, "rewards/margins": 2.510986328125, "rewards/rejected": -4.337841987609863, "step": 170 }, { "epoch": 0.12254901960784313, "grad_norm": 38.77955627441406, "learning_rate": 9.985646791521163e-07, "logits/chosen": -4.196875095367432, "logits/rejected": -4.171875, "logps/chosen": -307.32501220703125, "logps/rejected": -299.70001220703125, "loss": 0.2857, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.518969714641571, "rewards/margins": 2.327831983566284, "rewards/rejected": -2.846923828125, "step": 175 }, { "epoch": 0.12605042016806722, "grad_norm": 9.513249397277832, "learning_rate": 9.980646561732758e-07, "logits/chosen": -4.165625095367432, "logits/rejected": -4.1328125, "logps/chosen": -342.9624938964844, "logps/rejected": -336.9624938964844, "loss": 0.279, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.4823059141635895, "rewards/margins": 2.8280272483825684, "rewards/rejected": -3.3097167015075684, "step": 180 }, { "epoch": 0.12955182072829133, "grad_norm": 8.950021743774414, "learning_rate": 9.97490209056133e-07, "logits/chosen": -4.09765625, "logits/rejected": -4.087500095367432, "logps/chosen": -324.20001220703125, "logps/rejected": -332.0, "loss": 0.2007, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9912109375, "rewards/margins": 3.622851610183716, "rewards/rejected": -2.6298828125, "step": 185 }, { "epoch": 0.1330532212885154, "grad_norm": 95.99122619628906, "learning_rate": 9.968414236384021e-07, "logits/chosen": -4.088671684265137, "logits/rejected": -4.05859375, "logps/chosen": -323.01873779296875, "logps/rejected": -342.1875, "loss": 0.2168, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 1.2775146961212158, "rewards/margins": 4.678320407867432, "rewards/rejected": -3.3967041969299316, "step": 190 }, { "epoch": 0.13655462184873948, "grad_norm": 23.21297264099121, "learning_rate": 9.961183968659215e-07, "logits/chosen": -4.132031440734863, "logits/rejected": -4.114843845367432, "logps/chosen": -315.6312561035156, "logps/rejected": -333.41876220703125, "loss": 0.2552, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0270264148712158, "rewards/margins": 3.6083006858825684, "rewards/rejected": -4.63623046875, "step": 195 }, { "epoch": 0.1400560224089636, "grad_norm": 11.13497543334961, "learning_rate": 9.953212367781675e-07, "logits/chosen": -4.210156440734863, "logits/rejected": -4.153906345367432, "logps/chosen": -329.70001220703125, "logps/rejected": -343.25, "loss": 0.2498, "rewards/accuracies": 0.90625, "rewards/chosen": -2.5098876953125, "rewards/margins": 3.6513671875, "rewards/rejected": -6.163769721984863, "step": 200 }, { "epoch": 0.14355742296918766, "grad_norm": 9.842167854309082, "learning_rate": 9.944500624921093e-07, "logits/chosen": -4.247656345367432, "logits/rejected": -4.170312404632568, "logps/chosen": -308.3999938964844, "logps/rejected": -328.42498779296875, "loss": 0.1851, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.574206531047821, "rewards/margins": 4.539453029632568, "rewards/rejected": -5.109606742858887, "step": 205 }, { "epoch": 0.14705882352941177, "grad_norm": 11.150321960449219, "learning_rate": 9.93505004184412e-07, "logits/chosen": -4.275781154632568, "logits/rejected": -4.194531440734863, "logps/chosen": -294.6000061035156, "logps/rejected": -299.57501220703125, "loss": 0.2046, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.0366578102111816, "rewards/margins": 3.9019532203674316, "rewards/rejected": -0.867999255657196, "step": 210 }, { "epoch": 0.15056022408963585, "grad_norm": 11.823946952819824, "learning_rate": 9.92486203071982e-07, "logits/chosen": -4.1875, "logits/rejected": -4.145312309265137, "logps/chosen": -262.70623779296875, "logps/rejected": -274.20001220703125, "loss": 0.2582, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 3.4903321266174316, "rewards/margins": 3.1060547828674316, "rewards/rejected": 0.3824706971645355, "step": 215 }, { "epoch": 0.15406162464985995, "grad_norm": 4.554815769195557, "learning_rate": 9.913938113908674e-07, "logits/chosen": -4.1796875, "logits/rejected": -4.120312690734863, "logps/chosen": -282.3374938964844, "logps/rejected": -290.6187438964844, "loss": 0.2069, "rewards/accuracies": 0.9375, "rewards/chosen": 1.898474097251892, "rewards/margins": 3.626757860183716, "rewards/rejected": -1.730224609375, "step": 220 }, { "epoch": 0.15756302521008403, "grad_norm": 9.618704795837402, "learning_rate": 9.902279923735093e-07, "logits/chosen": -4.1328125, "logits/rejected": -4.083203315734863, "logps/chosen": -307.48126220703125, "logps/rejected": -318.1875, "loss": 0.1849, "rewards/accuracies": 0.9375, "rewards/chosen": -0.22895507514476776, "rewards/margins": 3.7847657203674316, "rewards/rejected": -4.01904296875, "step": 225 }, { "epoch": 0.16106442577030813, "grad_norm": 14.65978717803955, "learning_rate": 9.8898892022435e-07, "logits/chosen": -4.105859279632568, "logits/rejected": -4.065625190734863, "logps/chosen": -344.84375, "logps/rejected": -358.01251220703125, "loss": 0.212, "rewards/accuracies": 0.9375, "rewards/chosen": -2.134960889816284, "rewards/margins": 3.317578077316284, "rewards/rejected": -5.451562404632568, "step": 230 }, { "epoch": 0.1645658263305322, "grad_norm": 14.46033000946045, "learning_rate": 9.876767800938031e-07, "logits/chosen": -4.133984565734863, "logits/rejected": -4.099218845367432, "logps/chosen": -331.66876220703125, "logps/rejected": -340.7124938964844, "loss": 0.1346, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.249926805496216, "rewards/margins": 3.540234327316284, "rewards/rejected": -5.790820121765137, "step": 235 }, { "epoch": 0.16806722689075632, "grad_norm": 41.85720443725586, "learning_rate": 9.862917680505863e-07, "logits/chosen": -4.169531345367432, "logits/rejected": -4.102343559265137, "logps/chosen": -294.1499938964844, "logps/rejected": -309.17498779296875, "loss": 0.1258, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.0751953125, "rewards/margins": 4.217187404632568, "rewards/rejected": -4.292138576507568, "step": 240 }, { "epoch": 0.1715686274509804, "grad_norm": 42.87565612792969, "learning_rate": 9.848340910524241e-07, "logits/chosen": -4.21484375, "logits/rejected": -4.102343559265137, "logps/chosen": -311.0, "logps/rejected": -335.1499938964844, "loss": 0.1509, "rewards/accuracies": 0.9375, "rewards/chosen": 1.9837768077850342, "rewards/margins": 4.403222560882568, "rewards/rejected": -2.420605421066284, "step": 245 }, { "epoch": 0.17507002801120447, "grad_norm": 10.537090301513672, "learning_rate": 9.833039669151225e-07, "logits/chosen": -4.223437309265137, "logits/rejected": -4.108202934265137, "logps/chosen": -293.51251220703125, "logps/rejected": -308.17498779296875, "loss": 0.2425, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 2.3416991233825684, "rewards/margins": 4.234765529632568, "rewards/rejected": -1.894873023033142, "step": 250 }, { "epoch": 0.17857142857142858, "grad_norm": 2.457155466079712, "learning_rate": 9.817016242800215e-07, "logits/chosen": -4.33984375, "logits/rejected": -4.192187309265137, "logps/chosen": -278.23126220703125, "logps/rejected": -295.4375, "loss": 0.1635, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 2.135058641433716, "rewards/margins": 4.299609184265137, "rewards/rejected": -2.1654295921325684, "step": 255 }, { "epoch": 0.18207282913165265, "grad_norm": 14.015923500061035, "learning_rate": 9.8002730257983e-07, "logits/chosen": -4.373437404632568, "logits/rejected": -4.24609375, "logps/chosen": -283.6812438964844, "logps/rejected": -301.4624938964844, "loss": 0.239, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 2.7750000953674316, "rewards/margins": 3.8597655296325684, "rewards/rejected": -1.085168480873108, "step": 260 }, { "epoch": 0.18557422969187676, "grad_norm": 6.20159912109375, "learning_rate": 9.782812520028486e-07, "logits/chosen": -4.380468845367432, "logits/rejected": -4.271093845367432, "logps/chosen": -313.67498779296875, "logps/rejected": -328.0249938964844, "loss": 0.173, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.6710448265075684, "rewards/margins": 4.1572265625, "rewards/rejected": -1.4860107898712158, "step": 265 }, { "epoch": 0.18907563025210083, "grad_norm": 8.501385688781738, "learning_rate": 9.764637334555838e-07, "logits/chosen": -4.25, "logits/rejected": -4.155859470367432, "logps/chosen": -323.3500061035156, "logps/rejected": -346.7749938964844, "loss": 0.1794, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.511059582233429, "rewards/margins": 4.514257907867432, "rewards/rejected": -4.004687309265137, "step": 270 }, { "epoch": 0.19257703081232494, "grad_norm": 43.63261413574219, "learning_rate": 9.74575018523763e-07, "logits/chosen": -4.196093559265137, "logits/rejected": -4.1171875, "logps/chosen": -347.2749938964844, "logps/rejected": -358.70001220703125, "loss": 0.1583, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.863964855670929, "rewards/margins": 4.191601753234863, "rewards/rejected": -5.051171779632568, "step": 275 }, { "epoch": 0.19607843137254902, "grad_norm": 3.3350625038146973, "learning_rate": 9.726153894317505e-07, "logits/chosen": -4.134375095367432, "logits/rejected": -4.066015720367432, "logps/chosen": -347.7250061035156, "logps/rejected": -371.6499938964844, "loss": 0.1537, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.461084008216858, "rewards/margins": 4.367383003234863, "rewards/rejected": -5.829492092132568, "step": 280 }, { "epoch": 0.19957983193277312, "grad_norm": 8.582822799682617, "learning_rate": 9.705851390003783e-07, "logits/chosen": -4.100781440734863, "logits/rejected": -4.035937309265137, "logps/chosen": -333.8500061035156, "logps/rejected": -351.57501220703125, "loss": 0.1489, "rewards/accuracies": 0.9375, "rewards/chosen": -2.152880907058716, "rewards/margins": 4.238379001617432, "rewards/rejected": -6.391992092132568, "step": 285 }, { "epoch": 0.2030812324929972, "grad_norm": 6.626759052276611, "learning_rate": 9.684845706031877e-07, "logits/chosen": -4.0234375, "logits/rejected": -3.9957032203674316, "logps/chosen": -337.75, "logps/rejected": -357.95001220703125, "loss": 0.1771, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.648266553878784, "rewards/margins": 4.245703220367432, "rewards/rejected": -6.89453125, "step": 290 }, { "epoch": 0.20658263305322128, "grad_norm": 8.172663688659668, "learning_rate": 9.663139981210998e-07, "logits/chosen": -4.048047065734863, "logits/rejected": -4.009765625, "logps/chosen": -322.5, "logps/rejected": -356.04998779296875, "loss": 0.0956, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.319116234779358, "rewards/margins": 4.673632621765137, "rewards/rejected": -5.993750095367432, "step": 295 }, { "epoch": 0.21008403361344538, "grad_norm": 7.659239292144775, "learning_rate": 9.640737458955118e-07, "logits/chosen": -3.9984374046325684, "logits/rejected": -3.946093797683716, "logps/chosen": -307.3687438964844, "logps/rejected": -328.86248779296875, "loss": 0.1718, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.5625640749931335, "rewards/margins": 4.830859184265137, "rewards/rejected": -5.391699314117432, "step": 300 }, { "epoch": 0.21358543417366946, "grad_norm": 15.881430625915527, "learning_rate": 9.61764148679833e-07, "logits/chosen": -4.079687595367432, "logits/rejected": -3.9722657203674316, "logps/chosen": -293.5249938964844, "logps/rejected": -323.17498779296875, "loss": 0.1267, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.255767822265625, "rewards/margins": 5.384765625, "rewards/rejected": -5.129296779632568, "step": 305 }, { "epoch": 0.21708683473389356, "grad_norm": 3.689927577972412, "learning_rate": 9.59385551589462e-07, "logits/chosen": -4.171093940734863, "logits/rejected": -4.001172065734863, "logps/chosen": -309.2250061035156, "logps/rejected": -344.17498779296875, "loss": 0.1433, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.39042967557907104, "rewards/margins": 5.873437404632568, "rewards/rejected": -6.268750190734863, "step": 310 }, { "epoch": 0.22058823529411764, "grad_norm": 3.1176366806030273, "learning_rate": 9.56938310050219e-07, "logits/chosen": -4.214453220367432, "logits/rejected": -4.013671875, "logps/chosen": -304.75, "logps/rejected": -351.7875061035156, "loss": 0.0886, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.46104735136032104, "rewards/margins": 6.175000190734863, "rewards/rejected": -6.635546684265137, "step": 315 }, { "epoch": 0.22408963585434175, "grad_norm": 10.508804321289062, "learning_rate": 9.544227897452347e-07, "logits/chosen": -4.287890434265137, "logits/rejected": -4.046093940734863, "logps/chosen": -321.46875, "logps/rejected": -362.0249938964844, "loss": 0.2045, "rewards/accuracies": 0.9375, "rewards/chosen": 0.84326171875, "rewards/margins": 6.593945503234863, "rewards/rejected": -5.749804496765137, "step": 320 }, { "epoch": 0.22759103641456582, "grad_norm": 8.315167427062988, "learning_rate": 9.518393665603082e-07, "logits/chosen": -4.189453125, "logits/rejected": -4.031640529632568, "logps/chosen": -314.29376220703125, "logps/rejected": -360.70001220703125, "loss": 0.1578, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.0939452648162842, "rewards/margins": 6.074999809265137, "rewards/rejected": -7.168359279632568, "step": 325 }, { "epoch": 0.23109243697478993, "grad_norm": 7.695666313171387, "learning_rate": 9.491884265277382e-07, "logits/chosen": -4.165625095367432, "logits/rejected": -3.9828124046325684, "logps/chosen": -356.6000061035156, "logps/rejected": -392.125, "loss": 0.1062, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.137158155441284, "rewards/margins": 5.859667778015137, "rewards/rejected": -8.993749618530273, "step": 330 }, { "epoch": 0.234593837535014, "grad_norm": 29.099950790405273, "learning_rate": 9.46470365768641e-07, "logits/chosen": -4.122656345367432, "logits/rejected": -3.9632811546325684, "logps/chosen": -353.7124938964844, "logps/rejected": -384.63751220703125, "loss": 0.1972, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.113037109375, "rewards/margins": 5.382031440734863, "rewards/rejected": -9.497655868530273, "step": 335 }, { "epoch": 0.23809523809523808, "grad_norm": 8.61391830444336, "learning_rate": 9.436855904337594e-07, "logits/chosen": -4.144140720367432, "logits/rejected": -3.9921875, "logps/chosen": -359.67498779296875, "logps/rejected": -398.4750061035156, "loss": 0.1049, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.8545289039611816, "rewards/margins": 5.109375, "rewards/rejected": -8.96484375, "step": 340 }, { "epoch": 0.2415966386554622, "grad_norm": 14.49962329864502, "learning_rate": 9.408345166427718e-07, "logits/chosen": -4.134375095367432, "logits/rejected": -3.9613280296325684, "logps/chosen": -325.67498779296875, "logps/rejected": -355.8125, "loss": 0.2243, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.363330125808716, "rewards/margins": 5.4833984375, "rewards/rejected": -7.849609375, "step": 345 }, { "epoch": 0.24509803921568626, "grad_norm": 8.390877723693848, "learning_rate": 9.379175704221138e-07, "logits/chosen": -4.162109375, "logits/rejected": -3.991015672683716, "logps/chosen": -287.53125, "logps/rejected": -323.08123779296875, "loss": 0.1182, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.650317370891571, "rewards/margins": 5.601953029632568, "rewards/rejected": -6.25146484375, "step": 350 }, { "epoch": 0.24859943977591037, "grad_norm": 12.179805755615234, "learning_rate": 9.349351876413181e-07, "logits/chosen": -4.13671875, "logits/rejected": -3.953906297683716, "logps/chosen": -319.70001220703125, "logps/rejected": -360.36248779296875, "loss": 0.1587, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.6496093273162842, "rewards/margins": 5.811327934265137, "rewards/rejected": -7.461718559265137, "step": 355 }, { "epoch": 0.25210084033613445, "grad_norm": 11.038627624511719, "learning_rate": 9.318878139478839e-07, "logits/chosen": -4.061718940734863, "logits/rejected": -3.8882813453674316, "logps/chosen": -318.4375, "logps/rejected": -355.6875, "loss": 0.1205, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.5353882312774658, "rewards/margins": 5.686181545257568, "rewards/rejected": -7.225390434265137, "step": 360 }, { "epoch": 0.2556022408963585, "grad_norm": 12.718750953674316, "learning_rate": 9.287759047006859e-07, "logits/chosen": -4.049609184265137, "logits/rejected": -3.8804688453674316, "logps/chosen": -322.20001220703125, "logps/rejected": -350.9750061035156, "loss": 0.1692, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.41533201932907104, "rewards/margins": 6.026074409484863, "rewards/rejected": -6.441210746765137, "step": 365 }, { "epoch": 0.25910364145658266, "grad_norm": 6.676199436187744, "learning_rate": 9.255999249019307e-07, "logits/chosen": -3.9546875953674316, "logits/rejected": -3.853515625, "logps/chosen": -289.6875, "logps/rejected": -326.875, "loss": 0.1589, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5786621570587158, "rewards/margins": 6.227734565734863, "rewards/rejected": -4.654510498046875, "step": 370 }, { "epoch": 0.26260504201680673, "grad_norm": 2.6601064205169678, "learning_rate": 9.223603491276733e-07, "logits/chosen": -3.9593749046325684, "logits/rejected": -3.8558592796325684, "logps/chosen": -318.0, "logps/rejected": -365.1000061035156, "loss": 0.1186, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2627441883087158, "rewards/margins": 7.131640434265137, "rewards/rejected": -5.866894721984863, "step": 375 }, { "epoch": 0.2661064425770308, "grad_norm": 10.011348724365234, "learning_rate": 9.190576614569034e-07, "logits/chosen": -4.017968654632568, "logits/rejected": -3.8695311546325684, "logps/chosen": -323.7250061035156, "logps/rejected": -368.2250061035156, "loss": 0.1091, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.39854127168655396, "rewards/margins": 7.003125190734863, "rewards/rejected": -7.404687404632568, "step": 380 }, { "epoch": 0.2696078431372549, "grad_norm": 39.08020782470703, "learning_rate": 9.156923553992106e-07, "logits/chosen": -4.040625095367432, "logits/rejected": -3.8871092796325684, "logps/chosen": -312.4750061035156, "logps/rejected": -364.0874938964844, "loss": 0.132, "rewards/accuracies": 0.9375, "rewards/chosen": 1.1413085460662842, "rewards/margins": 7.5625, "rewards/rejected": -6.409375190734863, "step": 385 }, { "epoch": 0.27310924369747897, "grad_norm": 2.079184055328369, "learning_rate": 9.122649338210406e-07, "logits/chosen": -4.075390815734863, "logits/rejected": -3.9078125953674316, "logps/chosen": -310.21875, "logps/rejected": -352.7250061035156, "loss": 0.084, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4845214784145355, "rewards/margins": 7.5390625, "rewards/rejected": -7.056250095367432, "step": 390 }, { "epoch": 0.2766106442577031, "grad_norm": 12.645612716674805, "learning_rate": 9.08775908870554e-07, "logits/chosen": -4.032422065734863, "logits/rejected": -3.889843702316284, "logps/chosen": -321.88751220703125, "logps/rejected": -366.32501220703125, "loss": 0.1464, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.23764649033546448, "rewards/margins": 7.212109565734863, "rewards/rejected": -6.974804878234863, "step": 395 }, { "epoch": 0.2801120448179272, "grad_norm": 13.877970695495605, "learning_rate": 9.052258019010979e-07, "logits/chosen": -4.080078125, "logits/rejected": -3.911328077316284, "logps/chosen": -324.6499938964844, "logps/rejected": -367.5249938964844, "loss": 0.1189, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.22805175185203552, "rewards/margins": 6.795117378234863, "rewards/rejected": -6.571093559265137, "step": 400 }, { "epoch": 0.28361344537815125, "grad_norm": 8.056689262390137, "learning_rate": 9.01615143393301e-07, "logits/chosen": -4.070703029632568, "logits/rejected": -3.923828125, "logps/chosen": -303.82501220703125, "logps/rejected": -351.625, "loss": 0.0839, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4512085020542145, "rewards/margins": 6.696875095367432, "rewards/rejected": -7.1455078125, "step": 405 }, { "epoch": 0.28711484593837533, "grad_norm": 6.637094497680664, "learning_rate": 8.979444728758065e-07, "logits/chosen": -4.039453029632568, "logits/rejected": -3.895312547683716, "logps/chosen": -315.0, "logps/rejected": -365.5, "loss": 0.1436, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4109863340854645, "rewards/margins": 6.589453220367432, "rewards/rejected": -6.17919921875, "step": 410 }, { "epoch": 0.29061624649859946, "grad_norm": 1.8166749477386475, "learning_rate": 8.942143388446521e-07, "logits/chosen": -4.03125, "logits/rejected": -3.833203077316284, "logps/chosen": -353.0, "logps/rejected": -384.3999938964844, "loss": 0.1075, "rewards/accuracies": 0.96875, "rewards/chosen": -0.49619752168655396, "rewards/margins": 7.074999809265137, "rewards/rejected": -7.573828220367432, "step": 415 }, { "epoch": 0.29411764705882354, "grad_norm": 5.399685382843018, "learning_rate": 8.90425298681309e-07, "logits/chosen": -3.992968797683716, "logits/rejected": -3.8003907203674316, "logps/chosen": -317.8999938964844, "logps/rejected": -366.23748779296875, "loss": 0.1127, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.502575695514679, "rewards/margins": 7.261523246765137, "rewards/rejected": -6.760644435882568, "step": 420 }, { "epoch": 0.2976190476190476, "grad_norm": 49.38500213623047, "learning_rate": 8.865779185693957e-07, "logits/chosen": -3.990234375, "logits/rejected": -3.8460936546325684, "logps/chosen": -314.8999938964844, "logps/rejected": -356.23748779296875, "loss": 0.2774, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.49787598848342896, "rewards/margins": 6.515234470367432, "rewards/rejected": -6.016992092132568, "step": 425 }, { "epoch": 0.3011204481792717, "grad_norm": 6.173446178436279, "learning_rate": 8.826727734100741e-07, "logits/chosen": -3.994921922683716, "logits/rejected": -3.867968797683716, "logps/chosen": -317.0562438964844, "logps/rejected": -348.3125, "loss": 0.1749, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.03021240234375, "rewards/margins": 6.094140529632568, "rewards/rejected": -6.126562595367432, "step": 430 }, { "epoch": 0.30462184873949577, "grad_norm": 2.559511423110962, "learning_rate": 8.787104467361441e-07, "logits/chosen": -3.9769530296325684, "logits/rejected": -3.879687547683716, "logps/chosen": -313.11248779296875, "logps/rejected": -349.92498779296875, "loss": 0.0598, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.4130493104457855, "rewards/margins": 6.033593654632568, "rewards/rejected": -6.443163871765137, "step": 435 }, { "epoch": 0.3081232492997199, "grad_norm": 14.759501457214355, "learning_rate": 8.746915306248485e-07, "logits/chosen": -3.9320311546325684, "logits/rejected": -3.7914061546325684, "logps/chosen": -285.2749938964844, "logps/rejected": -328.20001220703125, "loss": 0.1598, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.0230560302734375, "rewards/margins": 5.8984375, "rewards/rejected": -5.926953315734863, "step": 440 }, { "epoch": 0.311624649859944, "grad_norm": 12.394731521606445, "learning_rate": 8.706166256094012e-07, "logits/chosen": -3.882031202316284, "logits/rejected": -3.7718749046325684, "logps/chosen": -299.70623779296875, "logps/rejected": -334.76251220703125, "loss": 0.2544, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.664074718952179, "rewards/margins": 5.223828315734863, "rewards/rejected": -4.555859565734863, "step": 445 }, { "epoch": 0.31512605042016806, "grad_norm": 9.317229270935059, "learning_rate": 8.664863405892504e-07, "logits/chosen": -3.80859375, "logits/rejected": -3.715625047683716, "logps/chosen": -285.53125, "logps/rejected": -320.8999938964844, "loss": 0.1249, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.667736828327179, "rewards/margins": 5.558789253234863, "rewards/rejected": -4.88818359375, "step": 450 }, { "epoch": 0.31862745098039214, "grad_norm": 4.753352642059326, "learning_rate": 8.623012927390934e-07, "logits/chosen": -3.755859375, "logits/rejected": -3.635937452316284, "logps/chosen": -316.8500061035156, "logps/rejected": -344.42498779296875, "loss": 0.1457, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.5716552734375, "rewards/margins": 5.596875190734863, "rewards/rejected": -4.024365425109863, "step": 455 }, { "epoch": 0.32212885154061627, "grad_norm": 5.382900238037109, "learning_rate": 8.580621074166552e-07, "logits/chosen": -3.954296827316284, "logits/rejected": -3.7894530296325684, "logps/chosen": -315.0062561035156, "logps/rejected": -343.9375, "loss": 0.1202, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.381860375404358, "rewards/margins": 6.178906440734863, "rewards/rejected": -4.7978515625, "step": 460 }, { "epoch": 0.32563025210084034, "grad_norm": 9.907479286193848, "learning_rate": 8.537694180692416e-07, "logits/chosen": -3.998828172683716, "logits/rejected": -3.8160157203674316, "logps/chosen": -307.23126220703125, "logps/rejected": -352.95001220703125, "loss": 0.1391, "rewards/accuracies": 0.9375, "rewards/chosen": 0.946850597858429, "rewards/margins": 6.853125095367432, "rewards/rejected": -5.913769721984863, "step": 465 }, { "epoch": 0.3291316526610644, "grad_norm": 6.668959617614746, "learning_rate": 8.494238661390864e-07, "logits/chosen": -4.036328315734863, "logits/rejected": -3.852734327316284, "logps/chosen": -341.2749938964844, "logps/rejected": -377.70001220703125, "loss": 0.1029, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.559814453125, "rewards/margins": 6.66796875, "rewards/rejected": -6.106640815734863, "step": 470 }, { "epoch": 0.3326330532212885, "grad_norm": 4.653944492340088, "learning_rate": 8.450261009675029e-07, "logits/chosen": -4.055078029632568, "logits/rejected": -3.8324217796325684, "logps/chosen": -332.875, "logps/rejected": -368.5249938964844, "loss": 0.1133, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8369506597518921, "rewards/margins": 6.491796970367432, "rewards/rejected": -7.325781345367432, "step": 475 }, { "epoch": 0.33613445378151263, "grad_norm": 4.796830177307129, "learning_rate": 8.405767796978544e-07, "logits/chosen": -4.044921875, "logits/rejected": -3.811328172683716, "logps/chosen": -332.76251220703125, "logps/rejected": -384.2250061035156, "loss": 0.1567, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.9822021722793579, "rewards/margins": 6.829297065734863, "rewards/rejected": -7.808203220367432, "step": 480 }, { "epoch": 0.3396358543417367, "grad_norm": 3.359666585922241, "learning_rate": 8.360765671773602e-07, "logits/chosen": -3.9605469703674316, "logits/rejected": -3.8082032203674316, "logps/chosen": -288.1499938964844, "logps/rejected": -325.57501220703125, "loss": 0.0853, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.07318115234375, "rewards/margins": 6.422265529632568, "rewards/rejected": -8.494531631469727, "step": 485 }, { "epoch": 0.3431372549019608, "grad_norm": 6.423489093780518, "learning_rate": 8.315261358577484e-07, "logits/chosen": -3.9214844703674316, "logits/rejected": -3.7406249046325684, "logps/chosen": -318.4312438964844, "logps/rejected": -365.95001220703125, "loss": 0.1519, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -2.554150342941284, "rewards/margins": 6.6875, "rewards/rejected": -9.240819931030273, "step": 490 }, { "epoch": 0.34663865546218486, "grad_norm": 2.9223127365112305, "learning_rate": 8.269261656947755e-07, "logits/chosen": -3.9156250953674316, "logits/rejected": -3.7308592796325684, "logps/chosen": -343.4375, "logps/rejected": -387.5, "loss": 0.0976, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.646899461746216, "rewards/margins": 6.696875095367432, "rewards/rejected": -9.341015815734863, "step": 495 }, { "epoch": 0.35014005602240894, "grad_norm": 6.421755790710449, "learning_rate": 8.22277344046621e-07, "logits/chosen": -3.90625, "logits/rejected": -3.739453077316284, "logps/chosen": -348.51251220703125, "logps/rejected": -373.5249938964844, "loss": 0.149, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7789063453674316, "rewards/margins": 5.763085842132568, "rewards/rejected": -8.544921875, "step": 500 }, { "epoch": 0.3536414565826331, "grad_norm": 5.0626912117004395, "learning_rate": 8.175803655711799e-07, "logits/chosen": -3.9496092796325684, "logits/rejected": -3.78125, "logps/chosen": -323.8687438964844, "logps/rejected": -360.26251220703125, "loss": 0.1334, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.853271484375, "rewards/margins": 6.216015815734863, "rewards/rejected": -8.064453125, "step": 505 }, { "epoch": 0.35714285714285715, "grad_norm": 19.834314346313477, "learning_rate": 8.1283593212226e-07, "logits/chosen": -4.016406059265137, "logits/rejected": -3.833203077316284, "logps/chosen": -342.1000061035156, "logps/rejected": -373.04998779296875, "loss": 0.2158, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6996338367462158, "rewards/margins": 5.773046970367432, "rewards/rejected": -7.474609375, "step": 510 }, { "epoch": 0.36064425770308123, "grad_norm": 6.62537145614624, "learning_rate": 8.080447526447078e-07, "logits/chosen": -4.09765625, "logits/rejected": -3.869921922683716, "logps/chosen": -325.82501220703125, "logps/rejected": -370.8999938964844, "loss": 0.2546, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.572802722454071, "rewards/margins": 6.523633003234863, "rewards/rejected": -7.099609375, "step": 515 }, { "epoch": 0.3641456582633053, "grad_norm": 6.350283622741699, "learning_rate": 8.032075430684724e-07, "logits/chosen": -4.087500095367432, "logits/rejected": -3.8460936546325684, "logps/chosen": -323.48748779296875, "logps/rejected": -367.32501220703125, "loss": 0.133, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.2872070372104645, "rewards/margins": 7.222460746765137, "rewards/rejected": -7.507421970367432, "step": 520 }, { "epoch": 0.36764705882352944, "grad_norm": 3.5338668823242188, "learning_rate": 7.983250262016275e-07, "logits/chosen": -4.032812595367432, "logits/rejected": -3.787890672683716, "logps/chosen": -334.2250061035156, "logps/rejected": -380.45001220703125, "loss": 0.0534, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.2791503965854645, "rewards/margins": 8.095507621765137, "rewards/rejected": -8.372265815734863, "step": 525 }, { "epoch": 0.3711484593837535, "grad_norm": 3.9917259216308594, "learning_rate": 7.933979316223631e-07, "logits/chosen": -3.942187547683716, "logits/rejected": -3.7660155296325684, "logps/chosen": -311.01251220703125, "logps/rejected": -357.2250061035156, "loss": 0.1091, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9442993402481079, "rewards/margins": 7.354687690734863, "rewards/rejected": -8.307812690734863, "step": 530 }, { "epoch": 0.3746498599439776, "grad_norm": 9.341059684753418, "learning_rate": 7.884269955699687e-07, "logits/chosen": -3.94921875, "logits/rejected": -3.740234375, "logps/chosen": -311.42498779296875, "logps/rejected": -348.1499938964844, "loss": 0.1043, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.904492199420929, "rewards/margins": 6.862109184265137, "rewards/rejected": -7.762499809265137, "step": 535 }, { "epoch": 0.37815126050420167, "grad_norm": 13.346263885498047, "learning_rate": 7.834129608348181e-07, "logits/chosen": -3.950000047683716, "logits/rejected": -3.7269530296325684, "logps/chosen": -337.54998779296875, "logps/rejected": -391.3500061035156, "loss": 0.1693, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.5969909429550171, "rewards/margins": 7.047656059265137, "rewards/rejected": -7.643945217132568, "step": 540 }, { "epoch": 0.38165266106442575, "grad_norm": 6.5441107749938965, "learning_rate": 7.783565766473776e-07, "logits/chosen": -3.940624952316284, "logits/rejected": -3.7562499046325684, "logps/chosen": -308.6312561035156, "logps/rejected": -345.5375061035156, "loss": 0.0949, "rewards/accuracies": 0.96875, "rewards/chosen": -1.02398681640625, "rewards/margins": 6.360937595367432, "rewards/rejected": -7.38427734375, "step": 545 }, { "epoch": 0.3851540616246499, "grad_norm": 2.8823161125183105, "learning_rate": 7.732585985662509e-07, "logits/chosen": -4.045702934265137, "logits/rejected": -3.8296875953674316, "logps/chosen": -317.6875, "logps/rejected": -366.54998779296875, "loss": 0.1415, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16395263373851776, "rewards/margins": 6.898633003234863, "rewards/rejected": -6.735644340515137, "step": 550 }, { "epoch": 0.38865546218487396, "grad_norm": 9.686930656433105, "learning_rate": 7.681197883652779e-07, "logits/chosen": -4.041406154632568, "logits/rejected": -3.8304686546325684, "logps/chosen": -309.0375061035156, "logps/rejected": -351.04998779296875, "loss": 0.0839, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4409423768520355, "rewards/margins": 6.984765529632568, "rewards/rejected": -6.541894435882568, "step": 555 }, { "epoch": 0.39215686274509803, "grad_norm": 10.684289932250977, "learning_rate": 7.629409139197062e-07, "logits/chosen": -3.9789061546325684, "logits/rejected": -3.748828172683716, "logps/chosen": -335.8500061035156, "logps/rejected": -377.6499938964844, "loss": 0.0957, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0236084461212158, "rewards/margins": 7.436327934265137, "rewards/rejected": -8.459375381469727, "step": 560 }, { "epoch": 0.3956582633053221, "grad_norm": 40.065433502197266, "learning_rate": 7.577227490914494e-07, "logits/chosen": -4.010546684265137, "logits/rejected": -3.7425780296325684, "logps/chosen": -329.25, "logps/rejected": -394.92498779296875, "loss": 0.1063, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4480224549770355, "rewards/margins": 8.193359375, "rewards/rejected": -8.646484375, "step": 565 }, { "epoch": 0.39915966386554624, "grad_norm": 9.474442481994629, "learning_rate": 7.52466073613452e-07, "logits/chosen": -3.9765625, "logits/rejected": -3.7738280296325684, "logps/chosen": -309.57501220703125, "logps/rejected": -359.4750061035156, "loss": 0.0688, "rewards/accuracies": 0.96875, "rewards/chosen": 0.10063476860523224, "rewards/margins": 7.375390529632568, "rewards/rejected": -7.2734375, "step": 570 }, { "epoch": 0.4026610644257703, "grad_norm": 10.86375904083252, "learning_rate": 7.471716729731763e-07, "logits/chosen": -3.9957032203674316, "logits/rejected": -3.723828077316284, "logps/chosen": -303.45001220703125, "logps/rejected": -363.125, "loss": 0.0727, "rewards/accuracies": 0.96875, "rewards/chosen": 1.292626976966858, "rewards/margins": 9.460546493530273, "rewards/rejected": -8.175000190734863, "step": 575 }, { "epoch": 0.4061624649859944, "grad_norm": 10.98038101196289, "learning_rate": 7.418403382952292e-07, "logits/chosen": -4.019140720367432, "logits/rejected": -3.759765625, "logps/chosen": -297.83123779296875, "logps/rejected": -359.3500061035156, "loss": 0.1873, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.3828125, "rewards/margins": 8.557812690734863, "rewards/rejected": -7.180371284484863, "step": 580 }, { "epoch": 0.4096638655462185, "grad_norm": 7.4733500480651855, "learning_rate": 7.364728662231483e-07, "logits/chosen": -4.013281345367432, "logits/rejected": -3.744921922683716, "logps/chosen": -317.73748779296875, "logps/rejected": -381.51251220703125, "loss": 0.0816, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.47197264432907104, "rewards/margins": 9.052343368530273, "rewards/rejected": -8.585156440734863, "step": 585 }, { "epoch": 0.41316526610644255, "grad_norm": 6.493350028991699, "learning_rate": 7.310700588003605e-07, "logits/chosen": -4.008203029632568, "logits/rejected": -3.751171827316284, "logps/chosen": -319.67498779296875, "logps/rejected": -392.48748779296875, "loss": 0.0859, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6444946527481079, "rewards/margins": 8.985156059265137, "rewards/rejected": -8.343066215515137, "step": 590 }, { "epoch": 0.4166666666666667, "grad_norm": 5.3325514793396, "learning_rate": 7.256327233503364e-07, "logits/chosen": -4.022656440734863, "logits/rejected": -3.778125047683716, "logps/chosen": -324.8687438964844, "logps/rejected": -389.88751220703125, "loss": 0.0995, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.2916259765625, "rewards/margins": 8.142187118530273, "rewards/rejected": -9.428418159484863, "step": 595 }, { "epoch": 0.42016806722689076, "grad_norm": 9.191666603088379, "learning_rate": 7.201616723559547e-07, "logits/chosen": -4.094531059265137, "logits/rejected": -3.783203125, "logps/chosen": -308.5625, "logps/rejected": -366.61248779296875, "loss": 0.0918, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4765869081020355, "rewards/margins": 8.8955078125, "rewards/rejected": -8.419530868530273, "step": 600 }, { "epoch": 0.42366946778711484, "grad_norm": 6.363277435302734, "learning_rate": 7.146577233380952e-07, "logits/chosen": -4.1796875, "logits/rejected": -3.88671875, "logps/chosen": -320.67498779296875, "logps/rejected": -369.1499938964844, "loss": 0.1508, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5018554925918579, "rewards/margins": 7.989453315734863, "rewards/rejected": -7.481738090515137, "step": 605 }, { "epoch": 0.4271708683473389, "grad_norm": 8.64117431640625, "learning_rate": 7.091216987334791e-07, "logits/chosen": -4.275000095367432, "logits/rejected": -3.9605469703674316, "logps/chosen": -324.36248779296875, "logps/rejected": -378.2124938964844, "loss": 0.1249, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.914660632610321, "rewards/margins": 8.302734375, "rewards/rejected": -7.39453125, "step": 610 }, { "epoch": 0.43067226890756305, "grad_norm": 5.129154682159424, "learning_rate": 7.035544257717761e-07, "logits/chosen": -4.269921779632568, "logits/rejected": -4.025781154632568, "logps/chosen": -281.7250061035156, "logps/rejected": -344.3500061035156, "loss": 0.111, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6723266839981079, "rewards/margins": 7.028906345367432, "rewards/rejected": -6.359179496765137, "step": 615 }, { "epoch": 0.4341736694677871, "grad_norm": 4.3600969314575195, "learning_rate": 6.979567363519926e-07, "logits/chosen": -4.233593940734863, "logits/rejected": -4.010937690734863, "logps/chosen": -288.03125, "logps/rejected": -341.3500061035156, "loss": 0.0781, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.39274901151657104, "rewards/margins": 6.916796684265137, "rewards/rejected": -6.522265434265137, "step": 620 }, { "epoch": 0.4376750700280112, "grad_norm": 10.305990219116211, "learning_rate": 6.923294669181659e-07, "logits/chosen": -4.236328125, "logits/rejected": -3.9925780296325684, "logps/chosen": -301.6875, "logps/rejected": -345.95001220703125, "loss": 0.1325, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3890869617462158, "rewards/margins": 7.2578125, "rewards/rejected": -5.867968559265137, "step": 625 }, { "epoch": 0.4411764705882353, "grad_norm": 1.9760684967041016, "learning_rate": 6.866734583343751e-07, "logits/chosen": -4.197656154632568, "logits/rejected": -3.9476561546325684, "logps/chosen": -322.34375, "logps/rejected": -374.51251220703125, "loss": 0.073, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.05327148362994194, "rewards/margins": 7.111328125, "rewards/rejected": -7.168749809265137, "step": 630 }, { "epoch": 0.44467787114845936, "grad_norm": 5.447768688201904, "learning_rate": 6.809895557590949e-07, "logits/chosen": -4.140234470367432, "logits/rejected": -3.901562452316284, "logps/chosen": -305.3999938964844, "logps/rejected": -357.5, "loss": 0.1295, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.04921875149011612, "rewards/margins": 7.282812595367432, "rewards/rejected": -7.23388671875, "step": 635 }, { "epoch": 0.4481792717086835, "grad_norm": 11.747628211975098, "learning_rate": 6.752786085189059e-07, "logits/chosen": -4.054296970367432, "logits/rejected": -3.833203077316284, "logps/chosen": -325.125, "logps/rejected": -365.7749938964844, "loss": 0.143, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.105444312095642, "rewards/margins": 7.122265815734863, "rewards/rejected": -8.227734565734863, "step": 640 }, { "epoch": 0.45168067226890757, "grad_norm": 9.866388320922852, "learning_rate": 6.695414699815826e-07, "logits/chosen": -4.037499904632568, "logits/rejected": -3.8316407203674316, "logps/chosen": -269.61248779296875, "logps/rejected": -324.20001220703125, "loss": 0.0917, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7059570550918579, "rewards/margins": 7.22265625, "rewards/rejected": -7.929296970367432, "step": 645 }, { "epoch": 0.45518207282913165, "grad_norm": 3.863745927810669, "learning_rate": 6.637789974285779e-07, "logits/chosen": -4.09375, "logits/rejected": -3.8394532203674316, "logps/chosen": -295.75, "logps/rejected": -346.57501220703125, "loss": 0.0851, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.2662353515625, "rewards/margins": 7.594531059265137, "rewards/rejected": -5.324511528015137, "step": 650 }, { "epoch": 0.4586834733893557, "grad_norm": 6.13060998916626, "learning_rate": 6.579920519269218e-07, "logits/chosen": -4.114843845367432, "logits/rejected": -3.8179688453674316, "logps/chosen": -297.4624938964844, "logps/rejected": -366.67498779296875, "loss": 0.1536, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.949853539466858, "rewards/margins": 8.067968368530273, "rewards/rejected": -6.114843845367432, "step": 655 }, { "epoch": 0.46218487394957986, "grad_norm": 6.907744884490967, "learning_rate": 6.521814982005552e-07, "logits/chosen": -4.055468559265137, "logits/rejected": -3.8070311546325684, "logps/chosen": -276.32501220703125, "logps/rejected": -336.20001220703125, "loss": 0.1828, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.5094726085662842, "rewards/margins": 7.813281059265137, "rewards/rejected": -6.301660060882568, "step": 660 }, { "epoch": 0.46568627450980393, "grad_norm": 6.405279636383057, "learning_rate": 6.463482045011171e-07, "logits/chosen": -3.948437452316284, "logits/rejected": -3.7085938453674316, "logps/chosen": -301.625, "logps/rejected": -353.82501220703125, "loss": 0.0832, "rewards/accuracies": 0.96875, "rewards/chosen": 1.435766577720642, "rewards/margins": 7.907031059265137, "rewards/rejected": -6.471289157867432, "step": 665 }, { "epoch": 0.469187675070028, "grad_norm": 5.61862850189209, "learning_rate": 6.404930424782052e-07, "logits/chosen": -3.858203172683716, "logits/rejected": -3.6597657203674316, "logps/chosen": -322.6187438964844, "logps/rejected": -367.32501220703125, "loss": 0.2208, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.5012023448944092, "rewards/margins": 7.526757717132568, "rewards/rejected": -9.029296875, "step": 670 }, { "epoch": 0.4726890756302521, "grad_norm": 10.052380561828613, "learning_rate": 6.346168870491273e-07, "logits/chosen": -3.8499999046325684, "logits/rejected": -3.655468702316284, "logps/chosen": -309.7124938964844, "logps/rejected": -365.9750061035156, "loss": 0.1371, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.228515625, "rewards/margins": 7.833203315734863, "rewards/rejected": -7.608105659484863, "step": 675 }, { "epoch": 0.47619047619047616, "grad_norm": 7.156270980834961, "learning_rate": 6.287206162681662e-07, "logits/chosen": -3.8675780296325684, "logits/rejected": -3.66015625, "logps/chosen": -318.625, "logps/rejected": -382.82501220703125, "loss": 0.1899, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.8764892816543579, "rewards/margins": 8.061426162719727, "rewards/rejected": -8.932031631469727, "step": 680 }, { "epoch": 0.4796918767507003, "grad_norm": 6.979720115661621, "learning_rate": 6.228051111953742e-07, "logits/chosen": -3.839062452316284, "logits/rejected": -3.701953172683716, "logps/chosen": -318.3999938964844, "logps/rejected": -366.8500061035156, "loss": 0.163, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.12661132216453552, "rewards/margins": 7.118750095367432, "rewards/rejected": -7.241406440734863, "step": 685 }, { "epoch": 0.4831932773109244, "grad_norm": 11.366744041442871, "learning_rate": 6.168712557649193e-07, "logits/chosen": -3.895312547683716, "logits/rejected": -3.709765672683716, "logps/chosen": -302.20001220703125, "logps/rejected": -355.625, "loss": 0.1296, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.614208996295929, "rewards/margins": 8.010937690734863, "rewards/rejected": -7.394921779632568, "step": 690 }, { "epoch": 0.48669467787114845, "grad_norm": 8.56261920928955, "learning_rate": 6.109199366530035e-07, "logits/chosen": -3.893749952316284, "logits/rejected": -3.7281250953674316, "logps/chosen": -321.76251220703125, "logps/rejected": -376.7250061035156, "loss": 0.0851, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7763427495956421, "rewards/margins": 7.8515625, "rewards/rejected": -7.073632717132568, "step": 695 }, { "epoch": 0.49019607843137253, "grad_norm": 5.708214282989502, "learning_rate": 6.049520431453666e-07, "logits/chosen": -3.9175782203674316, "logits/rejected": -3.755078077316284, "logps/chosen": -321.2250061035156, "logps/rejected": -367.1000061035156, "loss": 0.0858, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5287841558456421, "rewards/margins": 7.557812690734863, "rewards/rejected": -7.0302734375, "step": 700 }, { "epoch": 0.49369747899159666, "grad_norm": 11.397648811340332, "learning_rate": 5.989684670044059e-07, "logits/chosen": -3.911328077316284, "logits/rejected": -3.764453172683716, "logps/chosen": -308.01251220703125, "logps/rejected": -362.76251220703125, "loss": 0.1002, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.741455078125, "rewards/margins": 7.514062404632568, "rewards/rejected": -6.768750190734863, "step": 705 }, { "epoch": 0.49719887955182074, "grad_norm": 6.1462860107421875, "learning_rate": 5.929701023359229e-07, "logits/chosen": -3.9410157203674316, "logits/rejected": -3.764843702316284, "logps/chosen": -292.5375061035156, "logps/rejected": -352.04998779296875, "loss": 0.1366, "rewards/accuracies": 0.9375, "rewards/chosen": 0.510815441608429, "rewards/margins": 7.702343940734863, "rewards/rejected": -7.189648628234863, "step": 710 }, { "epoch": 0.5007002801120448, "grad_norm": 5.117457866668701, "learning_rate": 5.86957845455518e-07, "logits/chosen": -3.962890625, "logits/rejected": -3.7679686546325684, "logps/chosen": -310.33123779296875, "logps/rejected": -368.1625061035156, "loss": 0.1361, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.5436767339706421, "rewards/margins": 7.6328125, "rewards/rejected": -7.092187404632568, "step": 715 }, { "epoch": 0.5042016806722689, "grad_norm": 7.32170295715332, "learning_rate": 5.809325947546595e-07, "logits/chosen": -4.034765720367432, "logits/rejected": -3.8238282203674316, "logps/chosen": -320.0249938964844, "logps/rejected": -375.1499938964844, "loss": 0.1057, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6477905511856079, "rewards/margins": 7.616796970367432, "rewards/rejected": -6.969922065734863, "step": 720 }, { "epoch": 0.507703081232493, "grad_norm": 1.7897793054580688, "learning_rate": 5.748952505664384e-07, "logits/chosen": -4.041406154632568, "logits/rejected": -3.8167967796325684, "logps/chosen": -287.5625, "logps/rejected": -352.625, "loss": 0.0551, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.556073009967804, "rewards/margins": 8.061718940734863, "rewards/rejected": -7.5107421875, "step": 725 }, { "epoch": 0.511204481792717, "grad_norm": 6.413627624511719, "learning_rate": 5.688467150310352e-07, "logits/chosen": -4.096484184265137, "logits/rejected": -3.844921827316284, "logps/chosen": -315.6187438964844, "logps/rejected": -367.70001220703125, "loss": 0.1556, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9060913324356079, "rewards/margins": 7.874218940734863, "rewards/rejected": -6.9638671875, "step": 730 }, { "epoch": 0.5147058823529411, "grad_norm": 6.02368688583374, "learning_rate": 5.627878919609162e-07, "logits/chosen": -4.142187595367432, "logits/rejected": -3.893749952316284, "logps/chosen": -342.5249938964844, "logps/rejected": -383.4750061035156, "loss": 0.0987, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.24305419623851776, "rewards/margins": 7.592187404632568, "rewards/rejected": -7.352246284484863, "step": 735 }, { "epoch": 0.5182072829131653, "grad_norm": 9.392160415649414, "learning_rate": 5.567196867057792e-07, "logits/chosen": -4.110547065734863, "logits/rejected": -3.856640577316284, "logps/chosen": -327.32501220703125, "logps/rejected": -368.6000061035156, "loss": 0.0851, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1859862804412842, "rewards/margins": 7.625781059265137, "rewards/rejected": -8.806836128234863, "step": 740 }, { "epoch": 0.5217086834733894, "grad_norm": 7.507225513458252, "learning_rate": 5.506430060172713e-07, "logits/chosen": -4.096875190734863, "logits/rejected": -3.839062452316284, "logps/chosen": -352.70001220703125, "logps/rejected": -391.20001220703125, "loss": 0.1142, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.350988745689392, "rewards/margins": 7.507421970367432, "rewards/rejected": -8.854223251342773, "step": 745 }, { "epoch": 0.5252100840336135, "grad_norm": 6.702646255493164, "learning_rate": 5.445587579134949e-07, "logits/chosen": -4.087109565734863, "logits/rejected": -3.849609375, "logps/chosen": -300.8062438964844, "logps/rejected": -347.36248779296875, "loss": 0.1031, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7536376714706421, "rewards/margins": 7.333203315734863, "rewards/rejected": -8.081640243530273, "step": 750 }, { "epoch": 0.5287114845938375, "grad_norm": 2.128941774368286, "learning_rate": 5.38467851543326e-07, "logits/chosen": -4.013671875, "logits/rejected": -3.795703172683716, "logps/chosen": -322.1499938964844, "logps/rejected": -377.54998779296875, "loss": 0.0809, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7710937261581421, "rewards/margins": 7.187890529632568, "rewards/rejected": -7.957421779632568, "step": 755 }, { "epoch": 0.5322128851540616, "grad_norm": 6.914889335632324, "learning_rate": 5.323711970505627e-07, "logits/chosen": -3.9925780296325684, "logits/rejected": -3.772265672683716, "logps/chosen": -298.3125, "logps/rejected": -349.9375, "loss": 0.1101, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8685058355331421, "rewards/margins": 7.3203125, "rewards/rejected": -8.191991806030273, "step": 760 }, { "epoch": 0.5357142857142857, "grad_norm": 6.70955753326416, "learning_rate": 5.262697054379268e-07, "logits/chosen": -3.979687452316284, "logits/rejected": -3.748046875, "logps/chosen": -283.38751220703125, "logps/rejected": -351.4750061035156, "loss": 0.095, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.0792236328125, "rewards/margins": 7.942968845367432, "rewards/rejected": -6.858788967132568, "step": 765 }, { "epoch": 0.5392156862745098, "grad_norm": 36.12495422363281, "learning_rate": 5.201642884309341e-07, "logits/chosen": -3.9632811546325684, "logits/rejected": -3.719921827316284, "logps/chosen": -304.48748779296875, "logps/rejected": -369.1499938964844, "loss": 0.157, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.238244652748108, "rewards/margins": 8.524023056030273, "rewards/rejected": -7.287499904632568, "step": 770 }, { "epoch": 0.5427170868347339, "grad_norm": 14.315747261047363, "learning_rate": 5.140558583416589e-07, "logits/chosen": -3.999218702316284, "logits/rejected": -3.7222657203674316, "logps/chosen": -316.3999938964844, "logps/rejected": -378.625, "loss": 0.1734, "rewards/accuracies": 0.96875, "rewards/chosen": -0.00820312462747097, "rewards/margins": 8.339062690734863, "rewards/rejected": -8.352734565734863, "step": 775 }, { "epoch": 0.5462184873949579, "grad_norm": 1.9824923276901245, "learning_rate": 5.079453279324109e-07, "logits/chosen": -3.9703125953674316, "logits/rejected": -3.725781202316284, "logps/chosen": -296.875, "logps/rejected": -363.82501220703125, "loss": 0.1622, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.08635254204273224, "rewards/margins": 8.248437881469727, "rewards/rejected": -8.337890625, "step": 780 }, { "epoch": 0.5497198879551821, "grad_norm": 7.089279651641846, "learning_rate": 5.018336102793433e-07, "logits/chosen": -3.955859422683716, "logits/rejected": -3.7679686546325684, "logps/chosen": -265.8062438964844, "logps/rejected": -320.20001220703125, "loss": 0.1163, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.5814697742462158, "rewards/margins": 7.352734565734863, "rewards/rejected": -5.773046970367432, "step": 785 }, { "epoch": 0.5532212885154062, "grad_norm": 2.578310251235962, "learning_rate": 4.957216186360146e-07, "logits/chosen": -3.994140625, "logits/rejected": -3.713671922683716, "logps/chosen": -300.375, "logps/rejected": -361.125, "loss": 0.0588, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 2.330676317214966, "rewards/margins": 8.728124618530273, "rewards/rejected": -6.393945217132568, "step": 790 }, { "epoch": 0.5567226890756303, "grad_norm": 8.720524787902832, "learning_rate": 4.896102662969258e-07, "logits/chosen": -3.991406202316284, "logits/rejected": -3.7593750953674316, "logps/chosen": -297.4375, "logps/rejected": -344.1875, "loss": 0.0957, "rewards/accuracies": 0.96875, "rewards/chosen": 2.1497802734375, "rewards/margins": 7.647656440734863, "rewards/rejected": -5.4951171875, "step": 795 }, { "epoch": 0.5602240896358543, "grad_norm": 6.386475086212158, "learning_rate": 4.835004664610481e-07, "logits/chosen": -3.94140625, "logits/rejected": -3.7203125953674316, "logps/chosen": -287.70001220703125, "logps/rejected": -344.70001220703125, "loss": 0.1196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8922485113143921, "rewards/margins": 7.640234470367432, "rewards/rejected": -6.748437404632568, "step": 800 }, { "epoch": 0.5637254901960784, "grad_norm": 3.925546646118164, "learning_rate": 4.773931320953675e-07, "logits/chosen": -3.856250047683716, "logits/rejected": -3.660937547683716, "logps/chosen": -325.95623779296875, "logps/rejected": -381.20001220703125, "loss": 0.0525, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.652050793170929, "rewards/margins": 7.946484565734863, "rewards/rejected": -8.599609375, "step": 805 }, { "epoch": 0.5672268907563025, "grad_norm": 10.991551399230957, "learning_rate": 4.7128917579846287e-07, "logits/chosen": -3.854687452316284, "logits/rejected": -3.5933594703674316, "logps/chosen": -343.26873779296875, "logps/rejected": -402.2749938964844, "loss": 0.0996, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.73876953125, "rewards/margins": 8.411328315734863, "rewards/rejected": -9.155468940734863, "step": 810 }, { "epoch": 0.5707282913165266, "grad_norm": 4.291808605194092, "learning_rate": 4.6518950966414013e-07, "logits/chosen": -3.806640625, "logits/rejected": -3.583984375, "logps/chosen": -301.67498779296875, "logps/rejected": -367.25, "loss": 0.0675, "rewards/accuracies": 0.96875, "rewards/chosen": -0.19460448622703552, "rewards/margins": 8.220312118530273, "rewards/rejected": -8.418749809265137, "step": 815 }, { "epoch": 0.5742296918767507, "grad_norm": 7.3030009269714355, "learning_rate": 4.590950451451397e-07, "logits/chosen": -3.8453125953674316, "logits/rejected": -3.600781202316284, "logps/chosen": -327.45001220703125, "logps/rejected": -386.07501220703125, "loss": 0.1054, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3786132335662842, "rewards/margins": 8.593358993530273, "rewards/rejected": -9.979687690734863, "step": 820 }, { "epoch": 0.5777310924369747, "grad_norm": 3.5457348823547363, "learning_rate": 4.530066929169427e-07, "logits/chosen": -3.8628907203674316, "logits/rejected": -3.5679688453674316, "logps/chosen": -346.4750061035156, "logps/rejected": -403.0249938964844, "loss": 0.0685, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4722656309604645, "rewards/margins": 8.35546875, "rewards/rejected": -8.8291015625, "step": 825 }, { "epoch": 0.5812324929971989, "grad_norm": 2.543821096420288, "learning_rate": 4.469253627416905e-07, "logits/chosen": -3.877734422683716, "logits/rejected": -3.6019530296325684, "logps/chosen": -312.64373779296875, "logps/rejected": -373.82501220703125, "loss": 0.0816, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.34624022245407104, "rewards/margins": 9.057031631469727, "rewards/rejected": -9.40576171875, "step": 830 }, { "epoch": 0.584733893557423, "grad_norm": 8.854287147521973, "learning_rate": 4.4085196333224296e-07, "logits/chosen": -3.8851561546325684, "logits/rejected": -3.546875, "logps/chosen": -316.26873779296875, "logps/rejected": -381.32501220703125, "loss": 0.0958, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.4364257752895355, "rewards/margins": 8.853906631469727, "rewards/rejected": -9.285547256469727, "step": 835 }, { "epoch": 0.5882352941176471, "grad_norm": 8.832906723022461, "learning_rate": 4.347874022163919e-07, "logits/chosen": -3.852734327316284, "logits/rejected": -3.579296827316284, "logps/chosen": -299.01873779296875, "logps/rejected": -366.8500061035156, "loss": 0.0845, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5170654058456421, "rewards/margins": 8.686914443969727, "rewards/rejected": -9.208398818969727, "step": 840 }, { "epoch": 0.5917366946778712, "grad_norm": 15.137822151184082, "learning_rate": 4.2873258560125237e-07, "logits/chosen": -3.861328125, "logits/rejected": -3.583984375, "logps/chosen": -326.2749938964844, "logps/rejected": -385.9750061035156, "loss": 0.1595, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.552478015422821, "rewards/margins": 8.507421493530273, "rewards/rejected": -9.059374809265137, "step": 845 }, { "epoch": 0.5952380952380952, "grad_norm": 6.563608169555664, "learning_rate": 4.2268841823785126e-07, "logits/chosen": -3.8773436546325684, "logits/rejected": -3.607421875, "logps/chosen": -312.45623779296875, "logps/rejected": -370.125, "loss": 0.101, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.7504059076309204, "rewards/margins": 8.3125, "rewards/rejected": -9.064062118530273, "step": 850 }, { "epoch": 0.5987394957983193, "grad_norm": 13.671486854553223, "learning_rate": 4.166558032859338e-07, "logits/chosen": -3.848437547683716, "logits/rejected": -3.592968702316284, "logps/chosen": -313.1187438964844, "logps/rejected": -370.92498779296875, "loss": 0.178, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0292236804962158, "rewards/margins": 8.178515434265137, "rewards/rejected": -9.208593368530273, "step": 855 }, { "epoch": 0.6022408963585434, "grad_norm": 24.43784523010254, "learning_rate": 4.1063564217900617e-07, "logits/chosen": -3.866406202316284, "logits/rejected": -3.6640625, "logps/chosen": -322.48126220703125, "logps/rejected": -370.375, "loss": 0.1452, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.087890625, "rewards/margins": 7.253515720367432, "rewards/rejected": -8.346094131469727, "step": 860 }, { "epoch": 0.6057422969187675, "grad_norm": 4.205888748168945, "learning_rate": 4.0462883448963867e-07, "logits/chosen": -3.9609375, "logits/rejected": -3.7269530296325684, "logps/chosen": -310.83123779296875, "logps/rejected": -362.4375, "loss": 0.096, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.8477843999862671, "rewards/margins": 7.521874904632568, "rewards/rejected": -8.370312690734863, "step": 865 }, { "epoch": 0.6092436974789915, "grad_norm": 5.241097927093506, "learning_rate": 3.9863627779504473e-07, "logits/chosen": -3.971874952316284, "logits/rejected": -3.764843702316284, "logps/chosen": -308.04998779296875, "logps/rejected": -353.75, "loss": 0.0758, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.34832763671875, "rewards/margins": 7.138671875, "rewards/rejected": -7.488476753234863, "step": 870 }, { "epoch": 0.6127450980392157, "grad_norm": 5.45101261138916, "learning_rate": 3.9265886754295907e-07, "logits/chosen": -3.916015625, "logits/rejected": -3.7066407203674316, "logps/chosen": -304.71875, "logps/rejected": -361.8062438964844, "loss": 0.1716, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.31901854276657104, "rewards/margins": 6.958300590515137, "rewards/rejected": -6.638476371765137, "step": 875 }, { "epoch": 0.6162464985994398, "grad_norm": 1.5118215084075928, "learning_rate": 3.866974969178347e-07, "logits/chosen": -3.9496092796325684, "logits/rejected": -3.727343797683716, "logps/chosen": -293.2124938964844, "logps/rejected": -342.0249938964844, "loss": 0.0315, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1063232421875, "rewards/margins": 7.92578125, "rewards/rejected": -6.819726467132568, "step": 880 }, { "epoch": 0.6197478991596639, "grad_norm": 2.3815932273864746, "learning_rate": 3.80753056707376e-07, "logits/chosen": -3.957812547683716, "logits/rejected": -3.724609375, "logps/chosen": -316.8999938964844, "logps/rejected": -375.25, "loss": 0.0819, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2796142101287842, "rewards/margins": 8.724218368530273, "rewards/rejected": -7.444921970367432, "step": 885 }, { "epoch": 0.623249299719888, "grad_norm": 3.3432934284210205, "learning_rate": 3.7482643516943233e-07, "logits/chosen": -3.9781250953674316, "logits/rejected": -3.7242188453674316, "logps/chosen": -305.8062438964844, "logps/rejected": -366.07501220703125, "loss": 0.1337, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.043957471847534, "rewards/margins": 8.4375, "rewards/rejected": -6.396679878234863, "step": 890 }, { "epoch": 0.626750700280112, "grad_norm": 84.51541137695312, "learning_rate": 3.6891851789926885e-07, "logits/chosen": -3.908203125, "logits/rejected": -3.701171875, "logps/chosen": -296.64373779296875, "logps/rejected": -356.2250061035156, "loss": 0.1413, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6126953363418579, "rewards/margins": 7.667187690734863, "rewards/rejected": -7.053906440734863, "step": 895 }, { "epoch": 0.6302521008403361, "grad_norm": 1.273039698600769, "learning_rate": 3.6303018769723367e-07, "logits/chosen": -3.9496092796325684, "logits/rejected": -3.669921875, "logps/chosen": -341.0249938964844, "logps/rejected": -400.8999938964844, "loss": 0.0739, "rewards/accuracies": 0.96875, "rewards/chosen": 0.22305908799171448, "rewards/margins": 8.662500381469727, "rewards/rejected": -8.442187309265137, "step": 900 }, { "epoch": 0.6337535014005602, "grad_norm": 5.358231544494629, "learning_rate": 3.571623244368448e-07, "logits/chosen": -3.91015625, "logits/rejected": -3.6714844703674316, "logps/chosen": -323.13751220703125, "logps/rejected": -372.6000061035156, "loss": 0.0644, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.17644043266773224, "rewards/margins": 7.969531059265137, "rewards/rejected": -8.137890815734863, "step": 905 }, { "epoch": 0.6372549019607843, "grad_norm": 5.974215984344482, "learning_rate": 3.51315804933314e-07, "logits/chosen": -3.9496092796325684, "logits/rejected": -3.6480469703674316, "logps/chosen": -346.0249938964844, "logps/rejected": -419.6000061035156, "loss": 0.0908, "rewards/accuracies": 0.96875, "rewards/chosen": 0.17644043266773224, "rewards/margins": 8.888280868530273, "rewards/rejected": -8.713549613952637, "step": 910 }, { "epoch": 0.6407563025210085, "grad_norm": 2.7069153785705566, "learning_rate": 3.454915028125263e-07, "logits/chosen": -3.954296827316284, "logits/rejected": -3.6468749046325684, "logps/chosen": -333.4750061035156, "logps/rejected": -401.4750061035156, "loss": 0.0515, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.6935790777206421, "rewards/margins": 9.053906440734863, "rewards/rejected": -8.358495712280273, "step": 915 }, { "epoch": 0.6442577030812325, "grad_norm": 44.74658203125, "learning_rate": 3.396902883804976e-07, "logits/chosen": -3.8921875953674316, "logits/rejected": -3.6019530296325684, "logps/chosen": -312.125, "logps/rejected": -377.32501220703125, "loss": 0.0647, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.424072265625, "rewards/margins": 9.225000381469727, "rewards/rejected": -8.80078125, "step": 920 }, { "epoch": 0.6477591036414566, "grad_norm": 7.319411277770996, "learning_rate": 3.3391302849332753e-07, "logits/chosen": -3.856640577316284, "logits/rejected": -3.626171827316284, "logps/chosen": -313.0874938964844, "logps/rejected": -377.23748779296875, "loss": 0.0942, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7853729128837585, "rewards/margins": 9.032812118530273, "rewards/rejected": -8.244531631469727, "step": 925 }, { "epoch": 0.6512605042016807, "grad_norm": 11.135540008544922, "learning_rate": 3.28160586427668e-07, "logits/chosen": -3.885937452316284, "logits/rejected": -3.6246094703674316, "logps/chosen": -288.7437438964844, "logps/rejected": -351.0249938964844, "loss": 0.142, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.77581787109375, "rewards/margins": 9.06640625, "rewards/rejected": -8.296093940734863, "step": 930 }, { "epoch": 0.6547619047619048, "grad_norm": 4.436044692993164, "learning_rate": 3.2243382175172686e-07, "logits/chosen": -3.89453125, "logits/rejected": -3.627734422683716, "logps/chosen": -311.86248779296875, "logps/rejected": -375.1000061035156, "loss": 0.1683, "rewards/accuracies": 0.9375, "rewards/chosen": 0.195068359375, "rewards/margins": 8.670312881469727, "rewards/rejected": -8.473437309265137, "step": 935 }, { "epoch": 0.6582633053221288, "grad_norm": 3.3467283248901367, "learning_rate": 3.167335901968253e-07, "logits/chosen": -3.8296875953674316, "logits/rejected": -3.5863280296325684, "logps/chosen": -284.07501220703125, "logps/rejected": -342.4125061035156, "loss": 0.0814, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4918579161167145, "rewards/margins": 7.952343940734863, "rewards/rejected": -7.465429782867432, "step": 940 }, { "epoch": 0.6617647058823529, "grad_norm": 12.231038093566895, "learning_rate": 3.1106074352952887e-07, "logits/chosen": -3.879687547683716, "logits/rejected": -3.5738282203674316, "logps/chosen": -346.2250061035156, "logps/rejected": -409.0, "loss": 0.108, "rewards/accuracies": 0.96875, "rewards/chosen": 0.16412964463233948, "rewards/margins": 8.977343559265137, "rewards/rejected": -8.80859375, "step": 945 }, { "epoch": 0.665266106442577, "grad_norm": 3.6774492263793945, "learning_rate": 3.054161294243709e-07, "logits/chosen": -3.850390672683716, "logits/rejected": -3.6050782203674316, "logps/chosen": -337.42498779296875, "logps/rejected": -405.875, "loss": 0.1166, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9543212652206421, "rewards/margins": 8.693359375, "rewards/rejected": -9.644067764282227, "step": 950 }, { "epoch": 0.6687675070028011, "grad_norm": 0.6831060647964478, "learning_rate": 2.998005913371868e-07, "logits/chosen": -3.8199219703674316, "logits/rejected": -3.5648436546325684, "logps/chosen": -302.6000061035156, "logps/rejected": -353.5249938964844, "loss": 0.0515, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.94317626953125, "rewards/margins": 8.862500190734863, "rewards/rejected": -7.921533107757568, "step": 955 }, { "epoch": 0.6722689075630253, "grad_norm": 5.337077617645264, "learning_rate": 2.9421496837908034e-07, "logits/chosen": -3.8570313453674316, "logits/rejected": -3.573046922683716, "logps/chosen": -304.38751220703125, "logps/rejected": -363.8999938964844, "loss": 0.0926, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.72064208984375, "rewards/margins": 8.595703125, "rewards/rejected": -7.8828125, "step": 960 }, { "epoch": 0.6757703081232493, "grad_norm": 4.56184196472168, "learning_rate": 2.88660095191037e-07, "logits/chosen": -3.887890577316284, "logits/rejected": -3.602734327316284, "logps/chosen": -315.4750061035156, "logps/rejected": -383.875, "loss": 0.1173, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.33234864473342896, "rewards/margins": 8.784375190734863, "rewards/rejected": -9.114501953125, "step": 965 }, { "epoch": 0.6792717086834734, "grad_norm": 0.696527361869812, "learning_rate": 2.831368018192071e-07, "logits/chosen": -3.913281202316284, "logits/rejected": -3.602343797683716, "logps/chosen": -332.07501220703125, "logps/rejected": -395.6000061035156, "loss": 0.079, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6913818120956421, "rewards/margins": 8.940038681030273, "rewards/rejected": -8.248046875, "step": 970 }, { "epoch": 0.6827731092436975, "grad_norm": 2.46315598487854, "learning_rate": 2.7764591359087414e-07, "logits/chosen": -3.893359422683716, "logits/rejected": -3.5999999046325684, "logps/chosen": -312.73748779296875, "logps/rejected": -367.2875061035156, "loss": 0.0655, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8863891363143921, "rewards/margins": 8.484375, "rewards/rejected": -7.603515625, "step": 975 }, { "epoch": 0.6862745098039216, "grad_norm": 2.042008876800537, "learning_rate": 2.721882509911296e-07, "logits/chosen": -3.9203124046325684, "logits/rejected": -3.582812547683716, "logps/chosen": -347.5874938964844, "logps/rejected": -419.20001220703125, "loss": 0.1541, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.3082641661167145, "rewards/margins": 9.401562690734863, "rewards/rejected": -9.099413871765137, "step": 980 }, { "epoch": 0.6897759103641457, "grad_norm": 1.3312026262283325, "learning_rate": 2.6676462954027033e-07, "logits/chosen": -3.867968797683716, "logits/rejected": -3.5484375953674316, "logps/chosen": -304.625, "logps/rejected": -371.625, "loss": 0.0338, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.755627453327179, "rewards/margins": 9.58203125, "rewards/rejected": -8.826171875, "step": 985 }, { "epoch": 0.6932773109243697, "grad_norm": 2.3855769634246826, "learning_rate": 2.6137585967193725e-07, "logits/chosen": -3.8617186546325684, "logits/rejected": -3.551953077316284, "logps/chosen": -303.61248779296875, "logps/rejected": -385.3374938964844, "loss": 0.0761, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.10163573920726776, "rewards/margins": 9.792577743530273, "rewards/rejected": -9.689062118530273, "step": 990 }, { "epoch": 0.6967787114845938, "grad_norm": 11.995887756347656, "learning_rate": 2.560227466120164e-07, "logits/chosen": -3.828125, "logits/rejected": -3.534374952316284, "logps/chosen": -302.66876220703125, "logps/rejected": -365.88751220703125, "loss": 0.1035, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.35798341035842896, "rewards/margins": 8.803906440734863, "rewards/rejected": -9.166015625, "step": 995 }, { "epoch": 0.7002801120448179, "grad_norm": 15.732640266418457, "learning_rate": 2.5070609025831604e-07, "logits/chosen": -3.81640625, "logits/rejected": -3.471484422683716, "logps/chosen": -325.4624938964844, "logps/rejected": -398.7250061035156, "loss": 0.1369, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.690185546875, "rewards/margins": 9.998437881469727, "rewards/rejected": -9.308496475219727, "step": 1000 }, { "epoch": 0.7037815126050421, "grad_norm": 11.053906440734863, "learning_rate": 2.454266850610398e-07, "logits/chosen": -3.829296827316284, "logits/rejected": -3.542187452316284, "logps/chosen": -310.21875, "logps/rejected": -370.125, "loss": 0.1538, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.14753417670726776, "rewards/margins": 9.098437309265137, "rewards/rejected": -8.958593368530273, "step": 1005 }, { "epoch": 0.7072829131652661, "grad_norm": 12.015442848205566, "learning_rate": 2.4018531990407595e-07, "logits/chosen": -3.794140577316284, "logits/rejected": -3.5640625953674316, "logps/chosen": -264.40625, "logps/rejected": -331.7124938964844, "loss": 0.0833, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.13149413466453552, "rewards/margins": 8.956250190734863, "rewards/rejected": -8.8291015625, "step": 1010 }, { "epoch": 0.7107843137254902, "grad_norm": 1.9385806322097778, "learning_rate": 2.3498277798711723e-07, "logits/chosen": -3.854296922683716, "logits/rejected": -3.48828125, "logps/chosen": -366.0249938964844, "logps/rejected": -437.6000061035156, "loss": 0.1108, "rewards/accuracies": 0.96875, "rewards/chosen": 0.49018555879592896, "rewards/margins": 10.124218940734863, "rewards/rejected": -9.637890815734863, "step": 1015 }, { "epoch": 0.7142857142857143, "grad_norm": 1.8205724954605103, "learning_rate": 2.298198367086279e-07, "logits/chosen": -3.859375, "logits/rejected": -3.534374952316284, "logps/chosen": -300.01251220703125, "logps/rejected": -355.3500061035156, "loss": 0.1203, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.8221191167831421, "rewards/margins": 8.935937881469727, "rewards/rejected": -8.110547065734863, "step": 1020 }, { "epoch": 0.7177871148459384, "grad_norm": 6.349449634552002, "learning_rate": 2.2469726754968204e-07, "logits/chosen": -3.837890625, "logits/rejected": -3.553515672683716, "logps/chosen": -317.36248779296875, "logps/rejected": -380.54998779296875, "loss": 0.0981, "rewards/accuracies": 0.96875, "rewards/chosen": 0.7828369140625, "rewards/margins": 8.682812690734863, "rewards/rejected": -7.900390625, "step": 1025 }, { "epoch": 0.7212885154061625, "grad_norm": 9.13239574432373, "learning_rate": 2.196158359586825e-07, "logits/chosen": -3.882031202316284, "logits/rejected": -3.592578172683716, "logps/chosen": -326.0874938964844, "logps/rejected": -394.54998779296875, "loss": 0.0875, "rewards/accuracies": 0.96875, "rewards/chosen": 1.2667236328125, "rewards/margins": 8.850000381469727, "rewards/rejected": -7.580956935882568, "step": 1030 }, { "epoch": 0.7247899159663865, "grad_norm": 4.96783971786499, "learning_rate": 2.1457630123698233e-07, "logits/chosen": -3.916015625, "logits/rejected": -3.5726561546325684, "logps/chosen": -297.82501220703125, "logps/rejected": -364.7250061035156, "loss": 0.0886, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.98199462890625, "rewards/margins": 9.078125, "rewards/rejected": -8.098437309265137, "step": 1035 }, { "epoch": 0.7282913165266106, "grad_norm": 1.335874080657959, "learning_rate": 2.0957941642542587e-07, "logits/chosen": -3.8882813453674316, "logits/rejected": -3.5667967796325684, "logps/chosen": -314.42498779296875, "logps/rejected": -383.2250061035156, "loss": 0.0919, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.3975830078125, "rewards/margins": 8.814844131469727, "rewards/rejected": -7.421679496765137, "step": 1040 }, { "epoch": 0.7317927170868347, "grad_norm": 10.026313781738281, "learning_rate": 2.0462592819182374e-07, "logits/chosen": -3.897656202316284, "logits/rejected": -3.575390577316284, "logps/chosen": -292.70623779296875, "logps/rejected": -351.1625061035156, "loss": 0.1123, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2625000476837158, "rewards/margins": 8.888671875, "rewards/rejected": -7.621874809265137, "step": 1045 }, { "epoch": 0.7352941176470589, "grad_norm": 6.501437187194824, "learning_rate": 1.997165767193801e-07, "logits/chosen": -3.8714842796325684, "logits/rejected": -3.5679688453674316, "logps/chosen": -291.0874938964844, "logps/rejected": -349.07501220703125, "loss": 0.0678, "rewards/accuracies": 0.96875, "rewards/chosen": 1.906518578529358, "rewards/margins": 9.068750381469727, "rewards/rejected": -7.152929782867432, "step": 1050 }, { "epoch": 0.738795518207283, "grad_norm": 11.565328598022461, "learning_rate": 1.9485209559609145e-07, "logits/chosen": -3.869140625, "logits/rejected": -3.5992188453674316, "logps/chosen": -321.7124938964844, "logps/rejected": -376.25, "loss": 0.0903, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.876354992389679, "rewards/margins": 8.184374809265137, "rewards/rejected": -7.3046875, "step": 1055 }, { "epoch": 0.742296918767507, "grad_norm": 23.18073844909668, "learning_rate": 1.9003321170512726e-07, "logits/chosen": -3.868359327316284, "logits/rejected": -3.578125, "logps/chosen": -319.9750061035156, "logps/rejected": -368.95001220703125, "loss": 0.1726, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9228760004043579, "rewards/margins": 8.095703125, "rewards/rejected": -7.174609184265137, "step": 1060 }, { "epoch": 0.7457983193277311, "grad_norm": 8.127881050109863, "learning_rate": 1.8526064511621452e-07, "logits/chosen": -3.8570313453674316, "logits/rejected": -3.584765672683716, "logps/chosen": -304.26251220703125, "logps/rejected": -375.3999938964844, "loss": 0.0682, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.971264660358429, "rewards/margins": 8.897656440734863, "rewards/rejected": -7.934179782867432, "step": 1065 }, { "epoch": 0.7492997198879552, "grad_norm": 2.701737880706787, "learning_rate": 1.8053510897804103e-07, "logits/chosen": -3.811328172683716, "logits/rejected": -3.5132813453674316, "logps/chosen": -292.0249938964844, "logps/rejected": -358.1499938964844, "loss": 0.0499, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 1.67572021484375, "rewards/margins": 9.517578125, "rewards/rejected": -7.842577934265137, "step": 1070 }, { "epoch": 0.7528011204481793, "grad_norm": 0.31558746099472046, "learning_rate": 1.7585730941169101e-07, "logits/chosen": -3.7890625, "logits/rejected": -3.532031297683716, "logps/chosen": -290.4750061035156, "logps/rejected": -358.0, "loss": 0.0462, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.8272033929824829, "rewards/margins": 8.696874618530273, "rewards/rejected": -7.870898246765137, "step": 1075 }, { "epoch": 0.7563025210084033, "grad_norm": 15.150861740112305, "learning_rate": 1.7122794540513264e-07, "logits/chosen": -3.811718702316284, "logits/rejected": -3.522656202316284, "logps/chosen": -306.1499938964844, "logps/rejected": -368.875, "loss": 0.0674, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4841064512729645, "rewards/margins": 9.037500381469727, "rewards/rejected": -8.553125381469727, "step": 1080 }, { "epoch": 0.7598039215686274, "grad_norm": 5.7700300216674805, "learning_rate": 1.6664770870876937e-07, "logits/chosen": -3.821484327316284, "logits/rejected": -3.4632811546325684, "logps/chosen": -322.67498779296875, "logps/rejected": -388.625, "loss": 0.0482, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8797363042831421, "rewards/margins": 9.785937309265137, "rewards/rejected": -8.902783393859863, "step": 1085 }, { "epoch": 0.7633053221288515, "grad_norm": 10.431517601013184, "learning_rate": 1.621172837320754e-07, "logits/chosen": -3.78125, "logits/rejected": -3.494921922683716, "logps/chosen": -304.45001220703125, "logps/rejected": -369.67498779296875, "loss": 0.1005, "rewards/accuracies": 0.96875, "rewards/chosen": 0.6063598394393921, "rewards/margins": 9.159765243530273, "rewards/rejected": -8.555468559265137, "step": 1090 }, { "epoch": 0.7668067226890757, "grad_norm": 1.8601133823394775, "learning_rate": 1.5763734744132583e-07, "logits/chosen": -3.7445311546325684, "logits/rejected": -3.474609375, "logps/chosen": -312.1187438964844, "logps/rejected": -380.4750061035156, "loss": 0.1414, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4251464903354645, "rewards/margins": 8.892382621765137, "rewards/rejected": -8.470703125, "step": 1095 }, { "epoch": 0.7703081232492998, "grad_norm": 3.562940835952759, "learning_rate": 1.5320856925843995e-07, "logits/chosen": -3.7476563453674316, "logits/rejected": -3.4683594703674316, "logps/chosen": -313.1875, "logps/rejected": -368.2250061035156, "loss": 0.1008, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0226318836212158, "rewards/margins": 9.282031059265137, "rewards/rejected": -8.258398056030273, "step": 1100 }, { "epoch": 0.7738095238095238, "grad_norm": 17.6439208984375, "learning_rate": 1.4883161096095187e-07, "logits/chosen": -3.764843702316284, "logits/rejected": -3.450390577316284, "logps/chosen": -313.8374938964844, "logps/rejected": -387.875, "loss": 0.0673, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.898022472858429, "rewards/margins": 9.743359565734863, "rewards/rejected": -8.846094131469727, "step": 1105 }, { "epoch": 0.7773109243697479, "grad_norm": 0.7597159147262573, "learning_rate": 1.4450712658312352e-07, "logits/chosen": -3.704296827316284, "logits/rejected": -3.463671922683716, "logps/chosen": -313.67498779296875, "logps/rejected": -374.79998779296875, "loss": 0.0523, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.42988282442092896, "rewards/margins": 8.662500381469727, "rewards/rejected": -9.093358993530273, "step": 1110 }, { "epoch": 0.780812324929972, "grad_norm": 9.668035507202148, "learning_rate": 1.402357623182136e-07, "logits/chosen": -3.7015624046325684, "logits/rejected": -3.457812547683716, "logps/chosen": -331.70001220703125, "logps/rejected": -381.2749938964844, "loss": 0.1341, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.75238037109375, "rewards/margins": 8.151562690734863, "rewards/rejected": -8.9033203125, "step": 1115 }, { "epoch": 0.7843137254901961, "grad_norm": 4.6653594970703125, "learning_rate": 1.3601815642192038e-07, "logits/chosen": -3.733593702316284, "logits/rejected": -3.3960938453674316, "logps/chosen": -368.875, "logps/rejected": -434.7749938964844, "loss": 0.0877, "rewards/accuracies": 0.96875, "rewards/chosen": -0.5340576171875, "rewards/margins": 9.623437881469727, "rewards/rejected": -10.157812118530273, "step": 1120 }, { "epoch": 0.7878151260504201, "grad_norm": 6.899483680725098, "learning_rate": 1.3185493911700852e-07, "logits/chosen": -3.666015625, "logits/rejected": -3.408203125, "logps/chosen": -314.8125, "logps/rejected": -382.1000061035156, "loss": 0.1256, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.03854980319738388, "rewards/margins": 9.040624618530273, "rewards/rejected": -9.077733993530273, "step": 1125 }, { "epoch": 0.7913165266106442, "grad_norm": 6.99959659576416, "learning_rate": 1.2774673249913652e-07, "logits/chosen": -3.6636719703674316, "logits/rejected": -3.4175782203674316, "logps/chosen": -327.48748779296875, "logps/rejected": -384.9750061035156, "loss": 0.1503, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.278564453125, "rewards/margins": 8.178906440734863, "rewards/rejected": -8.453125, "step": 1130 }, { "epoch": 0.7948179271708683, "grad_norm": 20.7562255859375, "learning_rate": 1.2369415044390052e-07, "logits/chosen": -3.712109327316284, "logits/rejected": -3.444531202316284, "logps/chosen": -311.625, "logps/rejected": -366.32501220703125, "loss": 0.1526, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.01655273512005806, "rewards/margins": 8.888280868530273, "rewards/rejected": -8.871874809265137, "step": 1135 }, { "epoch": 0.7983193277310925, "grad_norm": 7.541940689086914, "learning_rate": 1.1969779851510358e-07, "logits/chosen": -3.732421875, "logits/rejected": -3.474609375, "logps/chosen": -328.7749938964844, "logps/rejected": -392.95001220703125, "loss": 0.0885, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.560778796672821, "rewards/margins": 8.80078125, "rewards/rejected": -8.240625381469727, "step": 1140 }, { "epoch": 0.8018207282913166, "grad_norm": 2.2544028759002686, "learning_rate": 1.1575827387426845e-07, "logits/chosen": -3.7105469703674316, "logits/rejected": -3.483203172683716, "logps/chosen": -296.51251220703125, "logps/rejected": -351.07501220703125, "loss": 0.0749, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.36418455839157104, "rewards/margins": 9.017578125, "rewards/rejected": -8.65625, "step": 1145 }, { "epoch": 0.8053221288515406, "grad_norm": 3.940847873687744, "learning_rate": 1.1187616519140646e-07, "logits/chosen": -3.719921827316284, "logits/rejected": -3.448437452316284, "logps/chosen": -311.1875, "logps/rejected": -391.17498779296875, "loss": 0.0505, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2625488340854645, "rewards/margins": 9.794530868530273, "rewards/rejected": -9.527734756469727, "step": 1150 }, { "epoch": 0.8088235294117647, "grad_norm": 1.4131155014038086, "learning_rate": 1.0805205255705402e-07, "logits/chosen": -3.710156202316284, "logits/rejected": -3.4437499046325684, "logps/chosen": -314.32501220703125, "logps/rejected": -379.8500061035156, "loss": 0.1, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4536376893520355, "rewards/margins": 9.346484184265137, "rewards/rejected": -8.892969131469727, "step": 1155 }, { "epoch": 0.8123249299719888, "grad_norm": 1.5823792219161987, "learning_rate": 1.0428650739559136e-07, "logits/chosen": -3.708203077316284, "logits/rejected": -3.455078125, "logps/chosen": -335.92498779296875, "logps/rejected": -400.67498779296875, "loss": 0.0985, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.008251952938735485, "rewards/margins": 9.153515815734863, "rewards/rejected": -9.148046493530273, "step": 1160 }, { "epoch": 0.8158263305322129, "grad_norm": 2.751337766647339, "learning_rate": 1.005800923798572e-07, "logits/chosen": -3.744140625, "logits/rejected": -3.426953077316284, "logps/chosen": -336.29998779296875, "logps/rejected": -408.32501220703125, "loss": 0.0771, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4516967833042145, "rewards/margins": 9.059374809265137, "rewards/rejected": -8.602343559265137, "step": 1165 }, { "epoch": 0.819327731092437, "grad_norm": 5.285534858703613, "learning_rate": 9.693336134706987e-08, "logits/chosen": -3.731640577316284, "logits/rejected": -3.482421875, "logps/chosen": -306.42498779296875, "logps/rejected": -367.1875, "loss": 0.0841, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2859741151332855, "rewards/margins": 8.418749809265137, "rewards/rejected": -8.137499809265137, "step": 1170 }, { "epoch": 0.822829131652661, "grad_norm": 3.021791934967041, "learning_rate": 9.334685921606944e-08, "logits/chosen": -3.725781202316284, "logits/rejected": -3.463671922683716, "logps/chosen": -314.86248779296875, "logps/rejected": -374.95001220703125, "loss": 0.0813, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.4312988221645355, "rewards/margins": 8.670312881469727, "rewards/rejected": -8.241796493530273, "step": 1175 }, { "epoch": 0.8263305322128851, "grad_norm": 4.565569877624512, "learning_rate": 8.982112190589236e-08, "logits/chosen": -3.7367186546325684, "logits/rejected": -3.482421875, "logps/chosen": -335.38751220703125, "logps/rejected": -400.5249938964844, "loss": 0.0808, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.4556640684604645, "rewards/margins": 9.122655868530273, "rewards/rejected": -8.664648056030273, "step": 1180 }, { "epoch": 0.8298319327731093, "grad_norm": 11.402658462524414, "learning_rate": 8.635667625569099e-08, "logits/chosen": -3.740234375, "logits/rejected": -3.428906202316284, "logps/chosen": -341.1499938964844, "logps/rejected": -405.70001220703125, "loss": 0.1468, "rewards/accuracies": 0.96875, "rewards/chosen": 0.505859375, "rewards/margins": 8.94140625, "rewards/rejected": -8.4326171875, "step": 1185 }, { "epoch": 0.8333333333333334, "grad_norm": 4.893910884857178, "learning_rate": 8.29540399460092e-08, "logits/chosen": -3.73828125, "logits/rejected": -3.440234422683716, "logps/chosen": -322.57501220703125, "logps/rejected": -388.2749938964844, "loss": 0.0675, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.34471434354782104, "rewards/margins": 9.078516006469727, "rewards/rejected": -9.425390243530273, "step": 1190 }, { "epoch": 0.8368347338935574, "grad_norm": 3.020184278488159, "learning_rate": 7.961372142142775e-08, "logits/chosen": -3.7210936546325684, "logits/rejected": -3.4769530296325684, "logps/chosen": -288.4624938964844, "logps/rejected": -358.875, "loss": 0.1032, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.40043944120407104, "rewards/margins": 9.099609375, "rewards/rejected": -9.497655868530273, "step": 1195 }, { "epoch": 0.8403361344537815, "grad_norm": 9.903548240661621, "learning_rate": 7.633621981458915e-08, "logits/chosen": -3.751171827316284, "logits/rejected": -3.470703125, "logps/chosen": -328.67498779296875, "logps/rejected": -400.6499938964844, "loss": 0.0628, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.7047363519668579, "rewards/margins": 9.202539443969727, "rewards/rejected": -9.906641006469727, "step": 1200 }, { "epoch": 0.8438375350140056, "grad_norm": 2.419182300567627, "learning_rate": 7.312202487161317e-08, "logits/chosen": -3.7496094703674316, "logits/rejected": -3.477343797683716, "logps/chosen": -319.375, "logps/rejected": -385.17498779296875, "loss": 0.0502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.12807616591453552, "rewards/margins": 9.272656440734863, "rewards/rejected": -9.147363662719727, "step": 1205 }, { "epoch": 0.8473389355742297, "grad_norm": 8.510985374450684, "learning_rate": 6.997161687891634e-08, "logits/chosen": -3.705078125, "logits/rejected": -3.462890625, "logps/chosen": -292.8374938964844, "logps/rejected": -358.4125061035156, "loss": 0.0975, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.4731079041957855, "rewards/margins": 9.19921875, "rewards/rejected": -8.723437309265137, "step": 1210 }, { "epoch": 0.8508403361344538, "grad_norm": 5.150385856628418, "learning_rate": 6.688546659144478e-08, "logits/chosen": -3.7718749046325684, "logits/rejected": -3.48828125, "logps/chosen": -313.0625, "logps/rejected": -386.0249938964844, "loss": 0.0476, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.7706543207168579, "rewards/margins": 9.772656440734863, "rewards/rejected": -8.990625381469727, "step": 1215 }, { "epoch": 0.8543417366946778, "grad_norm": 8.4049711227417, "learning_rate": 6.386403516232946e-08, "logits/chosen": -3.7289061546325684, "logits/rejected": -3.469921827316284, "logps/chosen": -324.8999938964844, "logps/rejected": -391.75, "loss": 0.0917, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.3090636730194092, "rewards/margins": 8.778124809265137, "rewards/rejected": -10.088281631469727, "step": 1220 }, { "epoch": 0.8578431372549019, "grad_norm": 53.30424499511719, "learning_rate": 6.090777407397902e-08, "logits/chosen": -3.7113280296325684, "logits/rejected": -3.458984375, "logps/chosen": -315.6875, "logps/rejected": -378.07501220703125, "loss": 0.1183, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.10551147162914276, "rewards/margins": 9.064648628234863, "rewards/rejected": -8.956250190734863, "step": 1225 }, { "epoch": 0.8613445378151261, "grad_norm": 12.400679588317871, "learning_rate": 5.801712507061563e-08, "logits/chosen": -3.72265625, "logits/rejected": -3.459765672683716, "logps/chosen": -300.88751220703125, "logps/rejected": -367.8500061035156, "loss": 0.0682, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.11720886081457138, "rewards/margins": 8.788281440734863, "rewards/rejected": -8.904687881469727, "step": 1230 }, { "epoch": 0.8648459383753502, "grad_norm": 2.6440250873565674, "learning_rate": 5.519252009226638e-08, "logits/chosen": -3.7691407203674316, "logits/rejected": -3.46875, "logps/chosen": -336.29998779296875, "logps/rejected": -396.2749938964844, "loss": 0.0641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09663085639476776, "rewards/margins": 9.427343368530273, "rewards/rejected": -9.334375381469727, "step": 1235 }, { "epoch": 0.8683473389355743, "grad_norm": 9.605412483215332, "learning_rate": 5.243438121022076e-08, "logits/chosen": -3.72265625, "logits/rejected": -3.462890625, "logps/chosen": -330.0062561035156, "logps/rejected": -404.17498779296875, "loss": 0.095, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.3220153748989105, "rewards/margins": 9.167187690734863, "rewards/rejected": -9.490234375, "step": 1240 }, { "epoch": 0.8718487394957983, "grad_norm": 7.742379188537598, "learning_rate": 4.974312056396113e-08, "logits/chosen": -3.69921875, "logits/rejected": -3.451953172683716, "logps/chosen": -312.7437438964844, "logps/rejected": -382.5625, "loss": 0.0875, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.741992175579071, "rewards/margins": 9.014843940734863, "rewards/rejected": -9.757031440734863, "step": 1245 }, { "epoch": 0.8753501400560224, "grad_norm": 2.646317720413208, "learning_rate": 4.711914029957842e-08, "logits/chosen": -3.721874952316284, "logits/rejected": -3.4429688453674316, "logps/chosen": -329.1000061035156, "logps/rejected": -392.3500061035156, "loss": 0.137, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7165282964706421, "rewards/margins": 9.112890243530273, "rewards/rejected": -9.827343940734863, "step": 1250 }, { "epoch": 0.8788515406162465, "grad_norm": 7.2253499031066895, "learning_rate": 4.456283250968096e-08, "logits/chosen": -3.707812547683716, "logits/rejected": -3.44140625, "logps/chosen": -304.82501220703125, "logps/rejected": -373.38751220703125, "loss": 0.0425, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.49871522188186646, "rewards/margins": 9.774999618530273, "rewards/rejected": -9.2763671875, "step": 1255 }, { "epoch": 0.8823529411764706, "grad_norm": 2.6227643489837646, "learning_rate": 4.2074579174805167e-08, "logits/chosen": -3.7425780296325684, "logits/rejected": -3.4820313453674316, "logps/chosen": -293.14373779296875, "logps/rejected": -363.45001220703125, "loss": 0.0338, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.5131057500839233, "rewards/margins": 9.817968368530273, "rewards/rejected": -9.3017578125, "step": 1260 }, { "epoch": 0.8858543417366946, "grad_norm": 1.3551222085952759, "learning_rate": 3.965475210633717e-08, "logits/chosen": -3.721874952316284, "logits/rejected": -3.423046827316284, "logps/chosen": -343.7250061035156, "logps/rejected": -410.67498779296875, "loss": 0.1, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5851074457168579, "rewards/margins": 9.858593940734863, "rewards/rejected": -10.449999809265137, "step": 1265 }, { "epoch": 0.8893557422969187, "grad_norm": 8.488232612609863, "learning_rate": 3.7303712890955075e-08, "logits/chosen": -3.7113280296325684, "logits/rejected": -3.458203077316284, "logps/chosen": -322.875, "logps/rejected": -390.95001220703125, "loss": 0.1156, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.894335925579071, "rewards/margins": 9.289843559265137, "rewards/rejected": -10.184374809265137, "step": 1270 }, { "epoch": 0.8928571428571429, "grad_norm": 4.776467800140381, "learning_rate": 3.5021812836597555e-08, "logits/chosen": -3.710156202316284, "logits/rejected": -3.4351563453674316, "logps/chosen": -322.61248779296875, "logps/rejected": -382.51251220703125, "loss": 0.0723, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9281371831893921, "rewards/margins": 9.189844131469727, "rewards/rejected": -10.128515243530273, "step": 1275 }, { "epoch": 0.896358543417367, "grad_norm": 14.904967308044434, "learning_rate": 3.2809392919969484e-08, "logits/chosen": -3.70703125, "logits/rejected": -3.430859327316284, "logps/chosen": -324.42498779296875, "logps/rejected": -399.8999938964844, "loss": 0.1605, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.23081055283546448, "rewards/margins": 9.43359375, "rewards/rejected": -9.666406631469727, "step": 1280 }, { "epoch": 0.8998599439775911, "grad_norm": 9.847896575927734, "learning_rate": 3.0666783735590615e-08, "logits/chosen": -3.75, "logits/rejected": -3.4605469703674316, "logps/chosen": -315.07501220703125, "logps/rejected": -394.17498779296875, "loss": 0.0653, "rewards/accuracies": 0.96875, "rewards/chosen": 0.1302490234375, "rewards/margins": 10.15234375, "rewards/rejected": -10.021875381469727, "step": 1285 }, { "epoch": 0.9033613445378151, "grad_norm": 7.819700241088867, "learning_rate": 2.859430544639624e-08, "logits/chosen": -3.7027344703674316, "logits/rejected": -3.426953077316284, "logps/chosen": -342.17498779296875, "logps/rejected": -412.9750061035156, "loss": 0.0678, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7393798828125, "rewards/margins": 9.6171875, "rewards/rejected": -10.364062309265137, "step": 1290 }, { "epoch": 0.9068627450980392, "grad_norm": 4.363787651062012, "learning_rate": 2.6592267735896067e-08, "logits/chosen": -3.690234422683716, "logits/rejected": -3.446093797683716, "logps/chosen": -334.76251220703125, "logps/rejected": -398.6499938964844, "loss": 0.0916, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8033202886581421, "rewards/margins": 8.637890815734863, "rewards/rejected": -9.435937881469727, "step": 1295 }, { "epoch": 0.9103641456582633, "grad_norm": 2.6290194988250732, "learning_rate": 2.4660969761899576e-08, "logits/chosen": -3.700390577316284, "logits/rejected": -3.458203077316284, "logps/chosen": -301.0249938964844, "logps/rejected": -365.0, "loss": 0.164, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.2901855409145355, "rewards/margins": 8.687891006469727, "rewards/rejected": -8.9794921875, "step": 1300 }, { "epoch": 0.9138655462184874, "grad_norm": 1.2787213325500488, "learning_rate": 2.2800700111813455e-08, "logits/chosen": -3.696484327316284, "logits/rejected": -3.4273438453674316, "logps/chosen": -322.29998779296875, "logps/rejected": -381.2250061035156, "loss": 0.0519, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4840148985385895, "rewards/margins": 9.659375190734863, "rewards/rejected": -9.176953315734863, "step": 1305 }, { "epoch": 0.9173669467787114, "grad_norm": 83.05997467041016, "learning_rate": 2.101173675951928e-08, "logits/chosen": -3.6820311546325684, "logits/rejected": -3.4390625953674316, "logps/chosen": -308.6812438964844, "logps/rejected": -380.0, "loss": 0.101, "rewards/accuracies": 0.96875, "rewards/chosen": -0.42204588651657104, "rewards/margins": 9.311718940734863, "rewards/rejected": -9.731249809265137, "step": 1310 }, { "epoch": 0.9208683473389355, "grad_norm": 2.9914722442626953, "learning_rate": 1.9294347023836475e-08, "logits/chosen": -3.720703125, "logits/rejected": -3.44921875, "logps/chosen": -323.2250061035156, "logps/rejected": -402.42498779296875, "loss": 0.0593, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.349386602640152, "rewards/margins": 9.157812118530273, "rewards/rejected": -9.510156631469727, "step": 1315 }, { "epoch": 0.9243697478991597, "grad_norm": 3.304603338241577, "learning_rate": 1.7648787528578126e-08, "logits/chosen": -3.708984375, "logits/rejected": -3.4117188453674316, "logps/chosen": -324.1875, "logps/rejected": -398.29998779296875, "loss": 0.0767, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.3753418028354645, "rewards/margins": 9.782031059265137, "rewards/rejected": -10.1640625, "step": 1320 }, { "epoch": 0.9278711484593838, "grad_norm": 2.7856390476226807, "learning_rate": 1.6075304164204385e-08, "logits/chosen": -3.7066407203674316, "logits/rejected": -3.455859422683716, "logps/chosen": -317.7562561035156, "logps/rejected": -396.13751220703125, "loss": 0.0949, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7166748046875, "rewards/margins": 8.772656440734863, "rewards/rejected": -9.496874809265137, "step": 1325 }, { "epoch": 0.9313725490196079, "grad_norm": 6.607837200164795, "learning_rate": 1.4574132051079658e-08, "logits/chosen": -3.723828077316284, "logits/rejected": -3.4527344703674316, "logps/chosen": -336.09375, "logps/rejected": -404.6000061035156, "loss": 0.0862, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.12067870795726776, "rewards/margins": 9.612500190734863, "rewards/rejected": -9.724218368530273, "step": 1330 }, { "epoch": 0.9348739495798319, "grad_norm": 5.29033088684082, "learning_rate": 1.3145495504339855e-08, "logits/chosen": -3.685546875, "logits/rejected": -3.4644532203674316, "logps/chosen": -315.57501220703125, "logps/rejected": -370.45001220703125, "loss": 0.1172, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.13958740234375, "rewards/margins": 8.61328125, "rewards/rejected": -8.471094131469727, "step": 1335 }, { "epoch": 0.938375350140056, "grad_norm": 8.475128173828125, "learning_rate": 1.1789608000373208e-08, "logits/chosen": -3.7242188453674316, "logits/rejected": -3.438281297683716, "logps/chosen": -313.14373779296875, "logps/rejected": -389.1000061035156, "loss": 0.0504, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.16085204482078552, "rewards/margins": 9.942187309265137, "rewards/rejected": -10.09765625, "step": 1340 }, { "epoch": 0.9418767507002801, "grad_norm": 7.167829990386963, "learning_rate": 1.0506672144921513e-08, "logits/chosen": -3.749218702316284, "logits/rejected": -3.4898438453674316, "logps/chosen": -304.2875061035156, "logps/rejected": -383.45001220703125, "loss": 0.0727, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.04436035081744194, "rewards/margins": 9.248437881469727, "rewards/rejected": -9.2109375, "step": 1345 }, { "epoch": 0.9453781512605042, "grad_norm": 3.9453182220458984, "learning_rate": 9.296879642805288e-09, "logits/chosen": -3.69140625, "logits/rejected": -3.4085936546325684, "logps/chosen": -304.625, "logps/rejected": -367.1625061035156, "loss": 0.0397, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.43353271484375, "rewards/margins": 9.653905868530273, "rewards/rejected": -10.091796875, "step": 1350 }, { "epoch": 0.9488795518207283, "grad_norm": 9.613517761230469, "learning_rate": 8.160411269278077e-09, "logits/chosen": -3.7249999046325684, "logits/rejected": -3.4898438453674316, "logps/chosen": -306.33123779296875, "logps/rejected": -374.20001220703125, "loss": 0.0415, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.807202160358429, "rewards/margins": 9.182812690734863, "rewards/rejected": -9.989062309265137, "step": 1355 }, { "epoch": 0.9523809523809523, "grad_norm": 5.10851526260376, "learning_rate": 7.097436843013782e-09, "logits/chosen": -3.7222657203674316, "logits/rejected": -3.462890625, "logps/chosen": -296.70001220703125, "logps/rejected": -363.45001220703125, "loss": 0.056, "rewards/accuracies": 0.96875, "rewards/chosen": -0.08205566555261612, "rewards/margins": 8.882031440734863, "rewards/rejected": -8.967187881469727, "step": 1360 }, { "epoch": 0.9558823529411765, "grad_norm": 2.2889809608459473, "learning_rate": 6.1081152007310675e-09, "logits/chosen": -3.716015577316284, "logits/rejected": -3.470703125, "logps/chosen": -325.7250061035156, "logps/rejected": -377.2250061035156, "loss": 0.1124, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.568286120891571, "rewards/margins": 9.180078506469727, "rewards/rejected": -9.743359565734863, "step": 1365 }, { "epoch": 0.9593837535014006, "grad_norm": 62.453704833984375, "learning_rate": 5.192594173459242e-09, "logits/chosen": -3.706249952316284, "logits/rejected": -3.4453125, "logps/chosen": -325.83123779296875, "logps/rejected": -394.23748779296875, "loss": 0.1212, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4400390684604645, "rewards/margins": 8.687891006469727, "rewards/rejected": -9.122949600219727, "step": 1370 }, { "epoch": 0.9628851540616247, "grad_norm": 1.9323933124542236, "learning_rate": 4.351010564447976e-09, "logits/chosen": -3.7621092796325684, "logits/rejected": -3.457812547683716, "logps/chosen": -333.2250061035156, "logps/rejected": -392.4750061035156, "loss": 0.0452, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.42540282011032104, "rewards/margins": 9.553125381469727, "rewards/rejected": -9.125781059265137, "step": 1375 }, { "epoch": 0.9663865546218487, "grad_norm": 3.083484411239624, "learning_rate": 3.5834901287255524e-09, "logits/chosen": -3.705859422683716, "logits/rejected": -3.457812547683716, "logps/chosen": -300.53125, "logps/rejected": -367.7250061035156, "loss": 0.0819, "rewards/accuracies": 0.96875, "rewards/chosen": 0.20588378608226776, "rewards/margins": 9.201952934265137, "rewards/rejected": -9.0, "step": 1380 }, { "epoch": 0.9698879551820728, "grad_norm": 7.1853437423706055, "learning_rate": 2.8901475543076647e-09, "logits/chosen": -3.735156297683716, "logits/rejected": -3.4710936546325684, "logps/chosen": -304.48748779296875, "logps/rejected": -368.8999938964844, "loss": 0.1296, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.914019763469696, "rewards/margins": 9.189453125, "rewards/rejected": -10.100000381469727, "step": 1385 }, { "epoch": 0.9733893557422969, "grad_norm": 11.565399169921875, "learning_rate": 2.2710864450596336e-09, "logits/chosen": -3.682812452316284, "logits/rejected": -3.4730467796325684, "logps/chosen": -283.4624938964844, "logps/rejected": -344.75, "loss": 0.0776, "rewards/accuracies": 0.96875, "rewards/chosen": -0.631591796875, "rewards/margins": 8.482812881469727, "rewards/rejected": -9.115625381469727, "step": 1390 }, { "epoch": 0.976890756302521, "grad_norm": 38.52793502807617, "learning_rate": 1.7263993052157867e-09, "logits/chosen": -3.762890577316284, "logits/rejected": -3.452343702316284, "logps/chosen": -338.5, "logps/rejected": -406.45001220703125, "loss": 0.0462, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.16367188096046448, "rewards/margins": 9.822656631469727, "rewards/rejected": -9.652929306030273, "step": 1395 }, { "epoch": 0.9803921568627451, "grad_norm": 14.1930570602417, "learning_rate": 1.2561675255564618e-09, "logits/chosen": -3.7249999046325684, "logits/rejected": -3.4789061546325684, "logps/chosen": -318.2875061035156, "logps/rejected": -389.04998779296875, "loss": 0.1392, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.48853760957717896, "rewards/margins": 8.848437309265137, "rewards/rejected": -9.339062690734863, "step": 1400 }, { "epoch": 0.9838935574229691, "grad_norm": 5.662397861480713, "learning_rate": 8.60461371246235e-10, "logits/chosen": -3.7425780296325684, "logits/rejected": -3.438671827316284, "logps/chosen": -344.48748779296875, "logps/rejected": -407.6000061035156, "loss": 0.1257, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.18222656846046448, "rewards/margins": 9.806249618530273, "rewards/rejected": -9.988672256469727, "step": 1405 }, { "epoch": 0.9873949579831933, "grad_norm": 2.6685423851013184, "learning_rate": 5.393399713341517e-10, "logits/chosen": -3.7054686546325684, "logits/rejected": -3.4292969703674316, "logps/chosen": -313.25, "logps/rejected": -392.75, "loss": 0.0421, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.519946277141571, "rewards/margins": 9.720312118530273, "rewards/rejected": -10.240625381469727, "step": 1410 }, { "epoch": 0.9908963585434174, "grad_norm": 5.167869567871094, "learning_rate": 2.928513099187402e-10, "logits/chosen": -3.7132811546325684, "logits/rejected": -3.440624952316284, "logps/chosen": -312.0, "logps/rejected": -371.2250061035156, "loss": 0.078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.597119152545929, "rewards/margins": 9.114843368530273, "rewards/rejected": -8.518359184265137, "step": 1415 }, { "epoch": 0.9943977591036415, "grad_norm": 5.322803020477295, "learning_rate": 1.2103221897746818e-10, "logits/chosen": -3.6968750953674316, "logits/rejected": -3.438671827316284, "logps/chosen": -316.2250061035156, "logps/rejected": -386.54998779296875, "loss": 0.0977, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.326605200767517, "rewards/margins": 9.290234565734863, "rewards/rejected": -10.621484756469727, "step": 1420 }, { "epoch": 0.9978991596638656, "grad_norm": 3.4535648822784424, "learning_rate": 2.3908372863368222e-11, "logits/chosen": -3.70703125, "logits/rejected": -3.448046922683716, "logps/chosen": -308.6000061035156, "logps/rejected": -380.0249938964844, "loss": 0.0616, "rewards/accuracies": 0.96875, "rewards/chosen": -0.05886230617761612, "rewards/margins": 9.678125381469727, "rewards/rejected": -9.73828125, "step": 1425 }, { "epoch": 1.0, "step": 1428, "total_flos": 0.0, "train_loss": 0.16149422216962198, "train_runtime": 9781.4907, "train_samples_per_second": 4.672, "train_steps_per_second": 0.146 } ], "logging_steps": 5, "max_steps": 1428, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }